# Imports

In [312]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from scipy.sparse import coo_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

import ast

# Path and Load Data 

In [313]:
# Read the processed CSV and keep only its `politics` column, as a one-column DataFrame.
data = pd.read_csv('politics_[actual_size=4968]_processed_merge.csv')[['politics']]
data.head()

Unnamed: 0,politics
0,"['scott', 'pruitt', ""trump'"", 'former', 'epa',..."
1,"['joint', 'fundrais', 'committe', 'run', 'marj..."
2,"['earliest', 'day', 'black', 'church', 'polit'..."
3,"['earli', 'novemb', 'rep', 'chip', 'roy', 'tex..."
4,"['add', 'west', 'wing', 'playbook', 'daili', '..."


In [314]:
# Rows before position 4968 are treated as politics tweets, the remaining rows as non-politics.
politics_positive_tweets = data.iloc[:4968]
politics_negative_tweets = data.iloc[4968:]

print('politics tweets size :', len(politics_positive_tweets))
print('non-politics tweets size :', len(politics_negative_tweets))

politics tweets size : 4968
non-politics tweets size : 5000


* Train/test split: the first 4,000 tweets of each class (8,000 in total, about 80%) form the training set, and the remaining 1,968 tweets (about 20%) form the test set.


In [315]:
# Hold-out split: within each class the first 4000 rows go to training
# and every row after them goes to the test (validation) set.
train_pos = politics_positive_tweets.iloc[:4000]
test_pos = politics_positive_tweets.iloc[4000:]

train_neg = politics_negative_tweets.iloc[:4000]
test_neg = politics_negative_tweets.iloc[4000:]

In [316]:
# Row-wise concatenation: positive rows first, negative rows after them.
train_x = pd.concat([train_pos, train_neg])
test_x = pd.concat([test_pos, test_neg])

- Create the numpy array of positive labels and negative labels.

In [317]:
# Column vectors of labels: ones for the positive rows followed by zeros for the negative rows.
train_y = np.concatenate((np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1))), axis=0)
test_y = np.concatenate((np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1))), axis=0)

In [318]:
# Sanity check: dimensions of the two label arrays.
print('train shaoe:', np.shape(train_y))
print('test shape:', np.shape(test_y))

train shaoe: (8000, 1)
test shape: (1968, 1)


# Vocabulary for Politics Dataset 

In [319]:
def generate_tweets_list(dataframe_single):
    '''
    Parse every cell of a DataFrame into a Python object.

    Input:
        dataframe_single: a DataFrame whose cells are strings holding Python
                          literals (e.g. "['scott', 'pruitt']")
    Output:
        a flat list with one parsed object per cell, visited row by row
    '''
    # ast.literal_eval turns the string form of a literal back into the object.
    return [
        ast.literal_eval(cell)
        for _, record in dataframe_single.iterrows()
        for cell in record.values
    ]

In [320]:
def build_freqs(train_x, train_y):
    '''
    Count how often each (word, label) pair occurs in the given tweets.

    Input:
        train_x: a DataFrame of string-encoded token lists, parsed with
                 generate_tweets_list
        train_y: the labels, one per tweet and in the same order as the tweets;
                 any array-like that flattens to one value per tweet, e.g. shape (m, 1)
    Output:
        freqs: a dictionary mapping each (word, label) tuple to its frequency
    '''
    # np.ravel always yields a 1-D array. np.squeeze would collapse a single
    # label of shape (1, 1) into a 0-d array whose .tolist() is a bare float,
    # and zip() cannot iterate over that.
    yslist = np.ravel(train_y).tolist()
    train_x_arr = generate_tweets_list(train_x)

    freqs = {}
    # zip pairs each label with its tweet; it stops at the shorter of the two.
    for y, tweet in zip(yslist, train_x_arr):
        for word in tweet:
            pair = (word, y)
            freqs[pair] = freqs.get(pair, 0) + 1

    return freqs

In [321]:
# Parse the training tweets into token lists and build the (word, label) frequency table.
train_x_arr = generate_tweets_list(train_x)
freqs = build_freqs(train_x, train_y)

In [322]:
# Number of distinct (word, label) keys in the frequency dictionary.
len(freqs)

13201

## Extract Features 
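* Given a list of tweets, extract the features and store them in a matrix. Each tweet gets two features (plus a constant bias term).
    * The first feature is the sum, over the words in the tweet, of each word's frequency in the positive (politics) class.
    * The second feature is the sum, over the words in the tweet, of each word's frequency in the negative (non-politics) class.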
* Then train the logistic regression classifier on these features.
* Test the classifier on a validation set. 

In [323]:
def extract_features(tweet, freqs):
    '''
    Build the feature row for a single tweet.

    Input:
        tweet: a list of words for one tweet
        freqs: a dictionary mapping (word, label) tuples to their frequencies
    Output:
        x: a feature vector of dimension (1, 3) laid out as
           [bias, summed label-1.0 frequencies, summed label-0.0 frequencies]
    '''
    words = list(tweet)

    # Words that never appear in freqs contribute 0 to either total.
    pos_total = sum(freqs.get((token, 1.0), 0) for token in words)
    neg_total = sum(freqs.get((token, 0.0), 0) for token in words)

    # The leading 1 is the bias term.
    x = np.array([[1.0, pos_total, neg_total]], dtype=float)

    assert x.shape == (1, 3)
    return x

In [324]:
# Example: the token list of the first training tweet.
train_x_arr[0]

['scott',
 'pruitt',
 "trump'",
 'former',
 'epa',
 'chief',
 'run',
 'u',
 'senat',
 'oklahoma',
 'year']

In [325]:
# Feature row for the first training tweet: [bias, label-1.0 total, label-0.0 total].
extract_features(train_x_arr[0], freqs)

array([[1.000e+00, 1.169e+03, 2.280e+02]])

In [326]:
# Words with no entry in freqs add 0 to both totals, leaving only the bias term set.
extract_features(['asd', 'asd', 'asd'], freqs)

array([[1., 0., 0.]])

In [327]:
# One feature row (bias, positive total, negative total) per training tweet.
X = np.zeros((len(train_x_arr), 3))
for row_idx, tweet_tokens in enumerate(train_x_arr):
    X[row_idx, :] = extract_features(tweet_tokens, freqs)

X[:5]

array([[1.000e+00, 1.169e+03, 2.280e+02],
       [1.000e+00, 5.070e+02, 1.930e+02],
       [1.000e+00, 1.379e+03, 7.950e+02],
       [1.000e+00, 1.360e+03, 4.560e+02],
       [1.000e+00, 1.024e+03, 2.760e+02]])

In [328]:
# Alias for the training labels; plain assignment, so `y` and `train_y` refer to the same array here.
y = train_y

In [329]:
# Shuffle features and labels together with a fixed seed so the rows stay aligned.
# NOTE(review): X_sparse is not referenced by any later cell visible here — confirm it is needed.
X_sparse = coo_matrix(X)
# NOTE(review): X and y are rebound to their shuffled versions, so re-running
# this cell alone shuffles the already-shuffled arrays again.
X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0)

y

array([[1.],
       [1.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]])

In [330]:
# Display the feature matrix after shuffling.
X

array([[1.000e+00, 5.390e+02, 4.100e+01],
       [1.000e+00, 6.740e+02, 1.710e+02],
       [1.000e+00, 1.790e+02, 5.320e+02],
       ...,
       [1.000e+00, 3.110e+02, 1.800e+01],
       [1.000e+00, 7.030e+02, 7.500e+01],
       [1.000e+00, 1.143e+03, 1.530e+02]])

# Classifier  1 : Logistic Regression 

* The sigmoid function is defined as: 

$$ h(z) = \frac{1}{1+e^{-z}} $$

It maps the input 'z' to a value that ranges between 0 and 1, and so it can be treated as a probability. 


# 1.1 Apply Logistic Regression and Cross-Validation

* As a reminder, we have a balanced dataset.

In [308]:
# Cross-validate a default LogisticRegression on the training features;
# y.ravel() flattens the (m, 1) label column to the 1-D shape the estimator expects.
# NOTE(review): the recorded scores are about 0.5, i.e. chance level on a balanced
# dataset — verify that X and y are still row-aligned and consider scaling the features.
cross_val_score(LogisticRegression(), X, y.ravel())

array([0.505625, 0.4875  , 0.509375, 0.5     , 0.504375])

# Test Logistic Regression Model

In [309]:
# Parse the held-out tweets into token lists, the same way as the training tweets.
test_x_arr = generate_tweets_list(test_x)

In [310]:
# One feature row per test tweet, computed with the frequency table built from the training set.
X_test = np.zeros((len(test_x_arr), 3))
for row_idx, tweet_tokens in enumerate(test_x_arr):
    X_test[row_idx, :] = extract_features(tweet_tokens, freqs)

X_test[:5]

array([[1.000e+00, 1.019e+03, 3.750e+02],
       [1.000e+00, 5.290e+02, 1.790e+02],
       [1.000e+00, 6.160e+02, 3.250e+02],
       [1.000e+00, 1.840e+02, 7.800e+01],
       [1.000e+00, 6.470e+02, 2.400e+02]])

In [311]:
# Fit the classifier in this cell so that scoring does not depend on a `clf`
# left over in the kernel; no earlier cell defines one.
clf = LogisticRegression().fit(X, y.ravel())

# Mean accuracy on the held-out set; labels are flattened from (m, 1) to 1-D.
clf.score(X_test, test_y.ravel())

0.6808943089430894

# Hyper-parameter Tuning and Selecting Best Models With Grid Search  