In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Helper Functions

In [38]:
def clean_tweets(df):
    temp_df = df.copy()
    
    temp_df.SentimentText = [re.sub(r"https?:\/\/.*\/[a-zA-Z0-9]*", "", w) for w in temp_df.SentimentText]
    temp_df.SentimentText = [re.sub(r"&amp;quot;|&amp;amp'", "", w) for w in temp_df.SentimentText]
    temp_df.SentimentText = [re.sub(r"@[a-zA-Z0-9]*", "", w) for w in temp_df.SentimentText]
    temp_df.SentimentText = [re.sub(r"\$[a-zA-Z0-9]*", "", w) for w in temp_df.SentimentText]
    temp_df.SentimentText = [re.sub(r"[0-9]*", "", w) for w in temp_df.SentimentText]
    temp_df.SentimentText = [w.strip() for w in temp_df.SentimentText]
    
    temp_df.SentimentText = temp_df.SentimentText.str.replace('[^\w\s]','')
        
    return temp_df

# Analysis

## Creating DF and Cleaning Tweets

In [3]:
# skips problem lines for now
df = pd.read_csv("Sentiment Analysis Dataset.csv",error_bad_lines=False, encoding='utf-8')

b'Skipping line 8836: expected 4 fields, saw 5\n'
b'Skipping line 535882: expected 4 fields, saw 7\n'


In [39]:
clean_df = clean_tweets(df)

In [40]:
X_train, X_test, y_train, y_test = train_test_split(clean_df.SentimentText, clean_df.Sentiment, test_size=0.20, random_state=42)

In [41]:
vectorizer_train = CountVectorizer()
X_train = vectorizer_train.fit_transform(X_train)

In [42]:
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train)
X_train_tf = tf_transformer.transform(X_train)

In [43]:
vectorizer_test = CountVectorizer(vocabulary=vectorizer_train.vocabulary_)
X_test = vectorizer_test.transform(X_test)

## Multinomial Naive Bayes

In [44]:
from sklearn.naive_bayes import MultinomialNB

In [45]:
clf = MultinomialNB()
clf.fit(X_train_tf,y_train)
y_pred = clf.predict(X_test)

In [46]:
print(classification_report(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.77      0.82      0.79    157670
          1       0.80      0.75      0.78    158053

avg / total       0.78      0.78      0.78    315723



## Bernoulli Naive Bayes

In [23]:
from sklearn.naive_bayes import BernoulliNB

In [47]:
clf = BernoulliNB()
clf.fit(X_train_tf,y_train)
y_pred = clf.predict(X_test)

In [48]:
print(classification_report(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.78      0.78      0.78    157670
          1       0.78      0.78      0.78    158053

avg / total       0.78      0.78      0.78    315723



## SVM

In [52]:
from sklearn import svm

In [47]:
clf = svm.SVM()
clf.fit(X_train_tf,y_train)
y_pred = clf.predict(X_test)

In [48]:
print(classification_report(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.78      0.78      0.78    157670
          1       0.78      0.78      0.78    158053

avg / total       0.78      0.78      0.78    315723



## Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression

### Logistic Regression

In [None]:
clf = LogisticRegression()
clf.fit(X_train_tf,y_train)

In [None]:
y_pred = clf.predict(X_test)

In [None]:
print(classification_report(y_test,y_pred))

## Exploratory Data Analysis