In [640]:
import pandas as pd
import spacy
import en_core_web_sm

In [641]:
df = pd.read_csv('Resources/training_data_large.csv', encoding="ISO-8859-1", header=None)

In [642]:
new_df = df[[0, 5]]
new_df[0] = new_df[0].map({0:0, 4:1})
new_df.dropna(how='any')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,0,5
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,1,Just woke up. Having no school is the best fee...
1599996,1,TheWDB.com - Very cool to hear old Walt interv...
1599997,1,Are you ready for your MoJo Makeover? Ask me f...
1599998,1,Happy 38th Birthday to my boo of alll time!!! ...


# Tokenizer

In [644]:
import re
import nltk
from nltk.stem import WordNetLemmatizer
from string import punctuation
tokenizer = nltk.tokenize.casual.TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)


In [671]:
def custom_tokenizer(tweet):
        tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'url', tweet) # remove URLs
        tweet = re.sub(r'#([^\s]+)', r'\1', tweet) # remove the # in #hashtag
        doc = tokenizer.tokenize(tweet)
        stopwords = list(punctuation) + ['url', 'rt', '...', '’', '‘', '…']
        return [WordNetLemmatizer().lemmatize(token) for token in doc if token not in stopwords]

In [690]:
#TESTING
test_str = 'RT @TomFitton: The unprecedented harassment of @RealDonaldTrump continues....https://t.co/cHc8yA5bSD'
test = custom_tokenizer(test_str)
print(test)

['the', 'unprecedented', 'harassment', 'of', 'continues']


# Vectorization

In [647]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression

In [648]:
X = new_df[5]
y = new_df[0]
vec = CountVectorizer(tokenizer=custom_tokenizer)

In [649]:
clf = Pipeline([('vectorizer', vec), 
                ('tfidf', TfidfTransformer()), 
                ('classsifier', LogisticRegression(max_iter=1000))])

In [650]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    train_size=0.7, 
                                                    random_state = 42)

In [651]:
clf.fit(X_train, y_train)

Pipeline(steps=[('vectorizer',
                 CountVectorizer(tokenizer=<function custom_tokenizer at 0x7f98a2616378>)),
                ('tfidf', TfidfTransformer()),
                ('classsifier', LogisticRegression(max_iter=1000))])

In [675]:
y_pred = clf.predict(X_test)

In [676]:
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.79      0.80    239361
           1       0.80      0.81      0.80    240639

    accuracy                           0.80    480000
   macro avg       0.80      0.80      0.80    480000
weighted avg       0.80      0.80      0.80    480000

[[189299  50062]
 [ 45713 194926]]
0.80046875


In [682]:
from joblib import dump, load

In [661]:
dump(clf, 'sentiment_model.joblib')

['sentiment_model.joblib']