In [72]:
%autosave 1

Autosaving every 1 seconds


In [16]:
import sklearn

In [68]:
import os
import re
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score


In [18]:
!unzip trainingandtestdata

Archive:  trainingandtestdata.zip
  inflating: testdata.manual.2009.06.14.csv  
  inflating: training.1600000.processed.noemoticon.csv  


In [49]:
cols = ["Sentiment", "ID", "Date", "Query_string", 'User', "Text"]
BASE_DIR = ''

In [50]:
df_tweets = pd.read_csv(os.path.join(BASE_DIR ,'training.1600000.processed.noemoticon.csv'), encoding = "latin-1", names = cols)

In [51]:
df_tweets.head()

Unnamed: 0,Sentiment,ID,Date,Query_string,User,Text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [53]:
def clean(raw):
    result = re.sub('<[a][^>]*>(.*?)</[a]>','Link.', raw)
    result = re.sub('&gt;'," ",result)
    result = re.sub('&#x27;',"",result)
    result = re.sub('&quot;',"",result)
    result = re.sub('&#x2F;',"",result)
    result = re.sub('<p>',"",result)
    result = re.sub('</i>',"",result)
    result = re.sub('&#62;',"",result)
    result = re.sub('<i>',"",result)
    result = re.sub('\n',"",result)
    return result

In [57]:
df_tweets['clean_tweets'] = df_tweets.Text.apply(func = clean)

In [60]:
df_tweets.head()


Unnamed: 0,Sentiment,ID,Date,Query_string,User,Text,clean_tweets
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","@nationwideclass no, it's not behaving at all...."


In [64]:
df_train, df_test = train_test_split(df_tweets, test_size = 0.3, stratify = df_tweets['Sentiment'], random_state = 21)
print(df_train.shape, df_test.shape)

(1120000, 7) (480000, 7)


In [70]:
tfidf_vectorizer = TfidfVectorizer(lowercase = True, max_features = 1000, stop_words = ENGLISH_STOP_WORDS)
tfidf_vectorizer.fit(df_train.clean_tweets)

TfidfVectorizer(max_features=1000,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}))

In [71]:
train_idf = tfidf_vectorizer.transform(df_train.clean_tweets)
test_idf = tfidf_vectorizer.transform(df_test.clean_tweets)

In [74]:
model_rf = RandomForestClassifier(n_estimators = 20)
model_rf.fit(train_idf, df_train.Sentiment)
predict_train = model_rf.predict(train_idf)
predict_test = model_rf.predict(test_idf)

In [76]:
print(sklearn.metrics.precision_score(y_true = df_train.Sentiment, y_pred = predict_train, pos_label = 4))
print(sklearn.metrics.precision_score(y_true = df_test.Sentiment, y_pred = predict_test, pos_label = 4))

0.9068972862437694
0.7249139453318902


In [99]:
pipeline = Pipeline(steps = [('tfidf', TfidfVectorizer(lowercase = True, max_features = 1000, stop_words = ENGLISH_STOP_WORDS)),
                          ('model', RandomForestClassifier(n_estimators = 100))
                         ])
pipeline.fit(df_train.clean_tweets, df_train.Sentiment)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=1000,
                                 stop_words=frozenset({'a', 'about', 'above',
                                                       'across', 'after',
                                                       'afterwards', 'again',
                                                       'against', 'all',
                                                       'almost', 'alone',
                                                       'along', 'already',
                                                       'also', 'although',
                                                       'always', 'am', 'among',
                                                       'amongst', 'amoungst',
                                                       'amount', 'an', 'and',
                                                       'another', 'any',
                                                       'anyhow', 'anyone',
           

In [83]:
import joblib

In [92]:
joblib.dump(pipeline ,'p_model')

['p_model']

In [93]:
jb = joblib.load('p_model')

In [98]:
jb.predict(df_train)

array([4, 4, 4, 4, 4, 4, 4])