In [108]:
#Data Analysis
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Data Preprocessing and Feature Engineering
from textblob import TextBlob
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

#Model Selection and Validation
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score

In [109]:
train_tweets = pd.read_csv("tweets.csv", nrows = 2000000)
test_tweets = pd.read_csv("tweets.csv", nrows=10000)
train_tweets['content'] = train_tweets['content'].astype(str)
test_tweets['content'] = test_tweets['content'].astype(str)
train_tweets = train_tweets[['class','content']]
test = test_tweets['content']

In [110]:
import nltk
nltk.download('stopwords')
def text_processing(content):
    
    #Generating the list of words in the tweet (hastags and other punctuations removed)
    def form_sentence(content):
        tweet_blob = TextBlob(content)
        return ' '.join(tweet_blob.words)
    new_tweet = form_sentence(content)
    
    #Removing stopwords and words with unusual symbols
    def no_user_alpha(content):
        tweet_list = [ele for ele in content.split() if ele != 'user']
        clean_tokens = [t for t in tweet_list if re.match(r'[^\W\d]*$', t)]
        clean_s = ' '.join(clean_tokens)
        clean_mess = [word for word in clean_s.split() if word.lower() not in stopwords.words('english')]
        return clean_mess
    no_punc_tweet = no_user_alpha(new_tweet)
    
    #Normalizing the words in tweets 
    def normalization(tweet_list):
        lem = WordNetLemmatizer()
        normalized_tweet = []
        for word in tweet_list:
            normalized_text = lem.lemmatize(word,'v')
            normalized_tweet.append(normalized_text)
        return normalized_tweet
    
    
    return normalization(no_punc_tweet)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/simransingh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [111]:
train_tweets['tweet_list'] = train_tweets['content'].apply(text_processing)
test_tweets['tweet_list'] = test_tweets['content'].apply(text_processing)

In [118]:
train_tweets[train_tweets['class']==0].drop('content',axis=1).head()

Unnamed: 0,class,tweet_list
0,0,"[switchfoot, http, Awww, bummer, shoulda, get, David, Carr, Third, Day]"
1,0,"[upset, ca, update, Facebook, texting, might, cry, result, School, today, also, Blah]"
2,0,"[Kenichan, dive, many, time, ball, Managed, save, rest, go, bound]"
3,0,"[whole, body, feel, itchy, like, fire]"
4,0,"[nationwideclass, behave, mad, ca, see]"


In [112]:
X = train_tweets['content']
y = train_tweets['class']
test = test_tweets['content']

In [113]:
from sklearn.model_selection import train_test_split
msg_train, msg_test, label_train, label_test = train_test_split(train_tweets['content'], train_tweets['class'], test_size=0.2)

In [114]:
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_processing)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])
pipeline.fit(msg_train,label_train)

Pipeline(steps=[('bow',
                 CountVectorizer(analyzer=<function text_processing at 0x7f8db71974c0>)),
                ('tfidf', TfidfTransformer()),
                ('classifier', MultinomialNB())])

In [115]:
predictions = pipeline.predict(msg_test)

print(classification_report(predictions,label_test))
print('\n')
print(confusion_matrix(predictions,label_test))
print(accuracy_score(predictions,label_test))

              precision    recall  f1-score   support

           0       1.00      0.96      0.98    333138
           1       0.82      0.98      0.89     66862

    accuracy                           0.96    400000
   macro avg       0.91      0.97      0.93    400000
weighted avg       0.97      0.96      0.96    400000



[[318561  14577]
 [  1316  65546]]
0.9602675
