## Tweets Classification  
Logistic Regression model that classifies tweets of two users.

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import linear_model
import sys
import json
import pandas as pd
import numpy as np
from textacy import preprocessing, preprocess
import nltk
import string
import textacy
from collections import Counter
from scipy import sparse
import csv
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

Retrieve text corpus from json files.  
The corpus is two files of tweets from Trump and Clinton official accounts.

In [2]:
with open("TrumpTweets.json", "r") as file:
    TrumpTweets = json.load(file)
with open("ClintonTweets.json", "r") as file:
    ClintonTweets = json.load(file)

Use textacy toolkit to remove special characters, numbers and punctuations.

In [4]:
textacy.__version__

'0.10.0'

In [9]:
def textacy_preprocess(line):
    line = preprocessing.normalize_quotation_marks(line)
    line = preprocessing.normalize_unicode(line)
    line = preprocessing.normalize_whitespace(line)
    line = preprocessing.remove_accents(line)
    line = preprocessing.remove_punctuation(line)
    line = preprocessing.replace_currency_symbols(line)
    line = preprocessing.replace_emails(line)
    line = preprocessing.replace_emojis(line)
    line = preprocessing.replace_hashtags(line)
    line = preprocessing.replace_numbers(line)
    line = preprocessing.replace_phone_numbers(line)
    line = preprocessing.replace_urls(line)
    line = preprocessing.replace_user_handles(line)
    return line

In [11]:
tA = list(TrumpTweets['text'].values())
cA = list(ClintonTweets['text'].values())
tA = [textacy_preprocess(line.translate(str.maketrans('', '', string.digits))) for line in list(set(tA))]
cA = [textacy_preprocess(line.translate(str.maketrans('', '', string.digits))) for line in list(set(cA))]


Extract features of corpus using TF-IDF Vectorizer and obtain training data for the classifier

In [12]:
tfidf_vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(
    1, 3), encoding='ascii', strip_accents='ascii', stop_words='english', max_features=2500)

X = tA + cA
y = [1]*len(tA) + [0]*len(cA)
dataset = tfidf_vectorizer.fit_transform(X, y=y).toarray()
Features = tfidf_vectorizer.get_feature_names()
datasetDF = pd.DataFrame(dataset)
Xset = pd.DataFrame(datasetDF)
Yset = pd.DataFrame(y)

Split dataset into 80% train, 20% test sets.

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    Xset.values.tolist(), Yset.values.tolist(), test_size=0.2, random_state=42)

Fit a logistic regression model using SGD optimizer.  
Do a grid search for hyperparameter tuning.

In [14]:
logRegModel = linear_model.SGDClassifier(loss='log', max_iter=1000, tol=1e-3)
parameters = {'alpha': [0.00001, 0.00003, 0.0001, 0.0003, 0.001,
                        0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30]}
optLRModel = GridSearchCV(logRegModel, parameters, cv=5)
optLRModel = optLRModel.fit(X_train, np.ravel(y_train))
# datasetDF.to_csv('dataset.csv')
print("Training set score: " + str(optLRModel.score(X_train, np.ravel(y_train))))
print("Test set score: " + str(optLRModel.score(X_test, np.ravel(y_test))))

Training set score: 0.954688731284476
Test set score: 0.9212598425196851


Prediction Demo: 

In [22]:
for i in range(10):
    print("1. Trump's tweet")
    print(tA[i])
    print("*******************************************")
    print("2. Clinton's tweet")
    print(cA[i])
    print("*******************************************")
    l1 = textacy_preprocess(tA[i].translate(str.maketrans('', '', string.digits)))
    x1 = tfidf_vectorizer.transform([l1])
    l2 = textacy_preprocess(cA[i].translate(str.maketrans('', '', string.digits)))
    x2 = tfidf_vectorizer.transform([l2])
    print("1. Prediction: ", "Trump" if (optLRModel.predict(x1) == 1) else "Clinton")
    print("2. Prediction: ", "Trump" if (optLRModel.predict(x2) == 1) else "Clinton")
    print("*******************************************")

# str1 = "We’re going to put a lot of coal miners and coal companies out of business."
# str2 = "When Mexico sends its people, they're not sending their best,"
# str3 = "I've read hundreds of books about China over the decades. I know the Chinese. I've made a lot of money with the Chinese. I understand the Chinese mind."
# str4 = "They're sending people that have lots of problems, and they're bringing those problems with us. They're bringing drugs. They're bringing crime. They're rapists. And some, I assume, are good people."
# str5 = "They are often connected to big drug cartels, they are not just gangs of kids anymore. They are often the kinds of kids that are called superpredators — no conscience, no empathy. We can talk about why they ended up that way, but first, we have to bring them to heel."
# l1 = textacy_preprocess(str1.translate(str.maketrans('', '', string.digits)))
# l2 = textacy_preprocess(str2.translate(str.maketrans('', '', string.digits)))
# l3 = textacy_preprocess(str3.translate(str.maketrans('', '', string.digits)))
# l4 = textacy_preprocess(str4.translate(str.maketrans('', '', string.digits)))
# l5 = textacy_preprocess(str5.translate(str.maketrans('', '', string.digits)))

# x1 = tfidf_vectorizer.transform([l1])
# x2 = tfidf_vectorizer.transform([l2])
# x3 = tfidf_vectorizer.transform([l3])
# x4 = tfidf_vectorizer.transform([l4])
# x5 = tfidf_vectorizer.transform([l5])

# print("/*******************/")
# print(optLRModel.predict(x1))
# print(optLRModel.predict(x2))
# print(optLRModel.predict(x3))
# print(optLRModel.predict(x4))
# print(optLRModel.predict(x5))


1. Trump's tweet
RT  CwElliott   dbongino You are correct     Dan  A bunch of us have been talking  and this is just the DNC buying and paying for the   
*******************************************
2. Clinton's tweet
Every minute of Michelle Obama s minute speech today is worth watching  https   t co ZlEXYaUa
*******************************************
1. Prediction:  Trump
2. Prediction:  Clinton
*******************************************
1. Trump's tweet
RT  callme Chari   newscomauHQ If he s anything like Trump congratulations to you all  You have a leader who loves your country and will pu   
*******************************************
2. Clinton's tweet
We are not going to build a wall and deport million people  That s not happening   DebateNight
*******************************************
1. Prediction:  Clinton
2. Prediction:  Clinton
*******************************************
1. Trump's tweet
Finally  great news at the Border  https   t co nofzYaQs
***************************