# Imports

In [58]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 

# Path and Load Data 

In [59]:
data = pd.read_csv('politics_[actual_size=4968]_processed_merge.csv').politics
data = pd.DataFrame(data=data)
data.head() 

Unnamed: 0,politics
0,"['scott', 'pruitt', ""trump'"", 'former', 'epa',..."
1,"['joint', 'fundrais', 'committe', 'run', 'marj..."
2,"['earliest', 'day', 'black', 'church', 'polit'..."
3,"['earli', 'novemb', 'rep', 'chip', 'roy', 'tex..."
4,"['add', 'west', 'wing', 'playbook', 'daili', '..."


In [60]:
politics_positive_tweets = data[:4968] 
print('politics tweets size :', len(politics_positive_tweets))
politics_negative_tweets = data[4968:] 
print('non-politics tweets size :', len(politics_negative_tweets))

politics tweets size : 4968
non-politics tweets size : 5000


* Train test split: 20% will be in the test set, and 80% in the training set.


In [61]:
# split the data into two pieces, one for training and one for testing (validation set) 
test_pos = politics_positive_tweets[4000:]
train_pos = politics_positive_tweets[:4000]

test_neg = politics_negative_tweets[4000:]
train_neg = politics_negative_tweets[:4000]

In [77]:
train_x = pd.concat([train_pos, train_neg], axis=0) 
test_x = pd.concat([test_pos, test_neg], axis=0)

- Create the numpy array of positive labels and negative labels.

In [78]:
# combine positive and negative labels
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [79]:
print('train shaoe:', train_y.shape)
print('test shape:', test_y.shape)

train shaoe: (8000, 1)
test shape: (1968, 1)


# Extract Vocabulary for Politics Dataset 

In [80]:
def build_freqs(tweets, ys):
    """Build frequencies.
    Input:
        tweets: a list of tweets that're already pre-processed 
        ys: an m x 1 array with the sentiment label of each tweet
            (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    yslist = np.squeeze(ys).tolist()
    
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in tweet:
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1

    return freqs

In [92]:
freqs = build_freqs(train_x.politics, train_y)

In [100]:
for y, tweet in zip(np.squeeze(train_y).tolist(), train_x.politics.to_list()):
    print('type', type(tweet))
    print(tweet)

type <class 'str'>
['scott', 'pruitt', "trump'", 'former', 'epa', 'chief', 'run', 'u', 'senat', 'oklahoma', 'year']
type <class 'str'>
['joint', 'fundrais', 'committe', 'run', 'marjori', 'taylor', 'green', 'matt', 'gaetz', 'paid', 'john', "eastman'", 'law', 'firm', 'least', '00', '‚Ä¶']
type <class 'str'>
['earliest', 'day', 'black', 'church', 'polit', 'necess', '‚Äî', 'c', 'except', 'last', 'two', 'decad', 'c', '‚Äô', 'black', 'church', 'lose', 'congreg', 'gentrif', 'pastor', 'push', 'chang']
type <class 'str'>
['earli', 'novemb', 'rep', 'chip', 'roy', 'text', 'mark', 'meadow', 'say', '‚Äú', 'need', 'ammo', '‚Äù', 'former', 'presid', 'donald', "trump'", 'effort', '‚Ä¶']
type <class 'str'>
['add', 'west', 'wing', 'playbook', 'daili', 'read', 'newsi', 'nugget', 'detail', 'find', 'anywher', 'els', 'biden', 'administr', 'subscrib', 'today']
type <class 'str'>
["we'r", 'live', 'report', 'discuss', 'inclus', 'climat', 'action', 'movement', 'join', 'us', 'üëá']
type <class 'str'>
['join', '

type <class 'str'>
['new', 'york', 'green', 'light', 'massiv', 'renew', 'energi', 'project', 'cut', 'fossil', 'fuel', 'relianc']
type <class 'str'>
['new', 'zelenski', 'say', 'world', 'prepar', 'russia', 'use', 'nuclear', 'weapon']
type <class 'str'>
['russia', 'accus', 'ukrainian', 'forc', 'shell', 'russian', 'villag']
type <class 'str'>
['teen', 'charg', 'hate', 'crime', 'attack', 'sikh', 'man']
type <class 'str'>
['nasa', '‚Äô', 'mar', 'helicopt', 'set', 'new', 'flight', 'record']
type <class 'str'>
['nj', 'governor', 'order', 'review', 'new', 'sex', 'ed', 'standard', 'amid', 'backlash']
type <class 'str'>
['feder', 'judg', 'rule', 'abram', 'cannot', 'use', 'committe', 'rais', 'unlimit', 'fund']
type <class 'str'>
['donald', 'trump', 'gop', 'member', 'slavishli', 'report', 'like', 'sycoph', 'said', '...', 'want', 'investig', 'know', 'minor', 'leader', 'walk', 'back', 'pull', 'plug', 'independ', 'jan', '6', 'commiss']
type <class 'str'>
['new', 'jersey', 'soon', 'kick', 'recreat', 'm

['report', 'std', 'case', 'surg', 'late', '2020', 'drop', 'earli', 'pandem', 'data', 'suggest']
type <class 'str'>
['gop', 'women', '‚Äô', 'group', 'relat', 'group', 'report', '6', 'million', 'first', 'quarter', 'fundrais', 'haul']
type <class 'str'>
['us', 'airport', 'top', 'rank', 'world', '‚Äô', 'busiest', 'hub', '2021']
type <class 'str'>
['outgo', 'ag', 'session', 'tout', 'doj', 'achiev', 'farewel', 'messag', 'colleagu', 'via']
type <class 'str'>
['fox', 'news', 'exec', 'condemn', '‚Äò', 'reprehens', '‚Äô', 'threat', 'made', 'outsid', 'home', 'tucker', 'carlson', 'via']
type <class 'str'>
['halftim', 'report', 'florida', 'race', 'get', 'tight', 'tight', 'tight', 'via']
type <class 'str'>
['ex-new', 'york', 'attorney', 'gener', 'eric', 'scheiderman', 'face', 'crimin', 'charg', 'physic', 'abus', 'probe', 'via']
type <class 'str'>
['georgia', 'elect', 'fight', 'heat', 'kemp', 'declar', 'victori', 'resign', 'secretari', 'post', 'abram', 'dig', 'via']
type <class 'str'>
['trump', 'kava

type <class 'str'>
['bolsonaro', 'irk', 'whatsapp', 'launch', 'new', 'tool', 'brazil', 'elect']
type <class 'str'>
['biden', 'nomin', 'former', 'treasuri', 'offici', 'barr', 'fed', 'top', 'regulatori', 'job']
type <class 'str'>
['u', 'deleg', 'discuss', 'migrat', 'panama', 'next', 'week']
type <class 'str'>
['explain', 'michael', 'barr', 'face', 'long', 'to-do', 'list', 'u', "fed'", 'next', 'wall', 'street', 'cop']
type <class 'str'>
['trucker', 'argentina', 'end', 'protest', 'threaten', 'grain', 'transport']
type <class 'str'>
['biden', 'nomin', 'former', 'treasuri', 'offici', 'barr', 'fed', 'top', 'regulatori', 'job']
type <class 'str'>
['u', 'deleg', 'discuss', 'migrat', 'panama', 'next', 'week']
type <class 'str'>
['explain', 'michael', 'barr', 'face', 'long', 'to-do', 'list', 'u', "fed'", 'next', 'wall', 'street', 'cop']
type <class 'str'>
['trucker', 'argentina', 'end', 'protest', 'threaten', 'grain', 'transport', 'ministri']
type <class 'str'>
['promin', 'democrat', 'parti', 'ac

type <class 'str'>
['prosecutor', 'unseal', 'indict', 'charg', 'member', "russia'", 'legislatur', 'two', 'staffer', 'orchestr', 'propaganda', 'disinform', 'campaign', 'target', 'us', 'lawmak']
type <class 'str'>
['coalit', 'civil', 'right', 'group', 'file', 'lawsuit', 'galveston', 'counti', 'texa', 'alleg', "county'", 'redistrict', 'plan', 'intent', 'discrimin', 'grow', 'minor', 'popul']
type <class 'str'>
["rnc'", 'decis', 'withdraw', 'particip', 'organ', 'long', 'manag', 'presidenti', 'debat', 'former', 'presid', 'donald', "trump'", 'fingerprint', 'analysi', "cnn'", 'chri', 'cillizza']
type <class 'str'>
['break', 'man', 'said', 'want', 'earn', 'former', 'presid', 'donald', "trump'", 'respect', 'us', 'capitol', 'riot', 'found', 'guilti', 'charg', 'juri', 'reject', 'blame', 'trump', 'defens']
type <class 'str'>
['vice', 'presid', 'kamala', 'harri', 'second', 'gentleman', 'doug', 'emhoff', 'hold', 'first', 'known', 'passov', 'seder', 'vice', "president'", 'resid']
type <class 'str'>
['

['matzo', 'ball', 'soup', 'get', 'glow-up', 'fresh', 'parsley', 'dill', 'chive', 'fennel', 'frond']
type <class 'str'>
['start', 'weekend', 'right']
type <class 'str'>
['custom', 'line', 'icon', 'hot', 'dog', 'roy', "choi'", 'kogi', 'truck', 'la']
type <class 'str'>
['thirti', 'year', 'ago', 'arizona', 'ice', 'tea', '99', 'cent', '‚Äî', 'today', 'cost']
type <class 'str'>
['char', 'cabbag', 'unexpect', 'side', 'want', 'stop', 'make']
type <class 'str'>
['proof', 'dog', 'realli', "man'", 'best', 'friend']
type <class 'str'>
['easi', 'kitchen', 'hack', 'know', 'need']
type <class 'str'>
['even', 'bake', 'cake', 'wine']
type <class 'str'>
['crunchi', 'fri', 'chickpea', 'replac', 'crouton', 'simpl', 'kale', 'caesar', 'salad']
type <class 'str'>
['3', 'breweri', 'entir', 'woman-own']
type <class 'str'>
['one', 'chef', 'mission', 'chang', 'mind', 'pineappl', 'pizza']
type <class 'str'>
["world'", 'oldest', 'whiskey', 'goe', '000', 'bottl']
type <class 'str'>
['new', 'fridg', 'invest', 'make'

type <class 'str'>
['thewalkingdead', 'star', 'rossmarquand', 'exclus', 'reveal', 'part', "tv'", 'lgbtq', 'represent', 'meant', 'okexclus']
type <class 'str'>
['melissariv', 'gave', 'emot', 'updat', 'tough', 'adopt', 'journey', "i'v", 'cri', 'lot']
type <class 'str'>
['unstopp', 'kelseaballerini', 'creat', 'countri', 'star', 'glam', 'makeup', 'look', 'home', 'cmtaward', '‚Äî', 'shop', 'red-carpet', 'readi', 'home', 'style']
type <class 'str'>
['kaleycuoco', 'readi', 'hit', 'date', 'scene', 'actress', 'dish', 'statu', 'love', 'life', 'divorc', 'karlcook', 'last', 'year']
type <class 'str'>
['troubl', 'paradis', 'üëÄ', 'meganfox', 'seem', 'beyond', 'annoy', 'fianc√©', 'machinegunkelli', 'dodg', 'red', 'carpet', 'viral', 'video']
type <class 'str'>
['new', 'detail', 'intim', 'thing', 'princeandrew', 'hole', 'bedroom', 'two', 'day', 'gold', 'trip', 'reveal']
type <class 'str'>
['scarlettjohansson', 'set', 'record', 'straight', 'wild', 'rumor']
type <class 'str'>
['tyrabank', 'confirm', '‚

type <class 'str'>
['wire', 'üò≥']
type <class 'str'>
['got', 'make', 'instant', 'classic', 'üçø']
type <class 'str'>
['fair', 'üò≠']
type <class 'str'>
['one', 'age', '...']
type <class 'str'>
['jayhawk', 'ship', '‚Äº', 'Ô∏è', 'down', 'nova', 'advanc', '10th', 'titl', 'game']
type <class 'str'>
['kansa', 'doubl', 'digit', 'half', 'üëÄ']
type <class 'str'>
['kansa', 'wore', 'titl', 'ix', 'shirt', "tonight'", 'final', 'four', 'game', '‚¨á', 'Ô∏è']
type <class 'str'>
['noth', 'new', 'suni', 'lee', 'ü§©', 'score', 'perfect', 'üîü', 'beam']
type <class 'str'>
['‚Äú', '‚Äô', 'nipsey', '‚Äù', 'three', 'year', 'ago', 'today', 'russel', 'westbrook', 'dedic', '20-20-', '20', 'game', 'nipsey', 'hussl', 'üèÅ']
type <class 'str'>
['three', 'win', 'row', '‚Äº', 'Ô∏è']
type <class 'str'>
["year'", 'freshman', 'class', 'stack', 'nba', 'talent', 'üìà', 'crew', 'take', 'look', 'worth', 'look', 'next', 'level', 'üëÄ']
type <class 'str'>
['villanova-kansa', 'unc-duk', 'break', 'everyth', "men'",