In [2]:
# importing required libraries
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [7]:
# Load the labelled tweets from the CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer="dataset/twitter_sentiments.csv")
# Preview the first five rows
data.head(n=5)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [10]:
# Relative frequency of each label value across the whole dataset
data.label.value_counts(normalize=True)

0    0.929854
1    0.070146
Name: label, dtype: float64

In [12]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both parts; the fixed seed makes the split repeatable.
train, test = train_test_split(
    data,
    test_size=0.2,
    stratify=data["label"],
    random_state=1213,
)

# Report the dimensions of each partition, one per line.
print(
    "Shape of Train data {}".format(train.shape),
    "Shape of Test data {}".format(test.shape),
    sep="\n",
)


Shape of Train data (25569, 3)
Shape of Test data (6393, 3)


In [15]:
# Label proportions in each partition (train first, then test)
print(train.label.value_counts(normalize=True))
print(test.label.value_counts(normalize=True))

0    0.929837
1    0.070163
Name: label, dtype: float64
0    0.929923
1    0.070077
Name: label, dtype: float64


In [28]:
# Create the TF-IDF vectorizer: lowercase the text, drop English stop words and
# cap the vocabulary at 1000 features.
# stop_words="english" selects scikit-learn's built-in English list (the same
# set that is exported as ENGLISH_STOP_WORDS). The string form is used because
# newer scikit-learn releases validate this parameter and accept only
# 'english', a list, or None -- passing the frozenset itself is rejected there.
tfidf_vectorizer = TfidfVectorizer(lowercase=True, max_features=1000, stop_words="english")

# Fit the vectorizer on the training tweets and transform them
train_idf = tfidf_vectorizer.fit_transform(train.tweet)
# Transform only (no refit) on the test tweets, reusing the fitted vectorizer
test_idf = tfidf_vectorizer.transform(test.tweet)

## Model building

In [29]:
# Logistic regression classifier, created and fitted on the TF-IDF training matrix
model_LR = LogisticRegression().fit(train_idf, train.label)

# Predicted labels for the training and the test matrices
predict_train = model_LR.predict(train_idf)
predict_test = model_LR.predict(test_idf)

# F1 score of the predictions against the true labels, per split
f1_train = f1_score(train.label, predict_train)
f1_test = f1_score(test.label, predict_test)

print("F1 score for train data {:.2f}".format(f1_train))
print("F1 score for test data {:.2f}".format(f1_test))

F1 score for train data 0.49
F1 score for test data 0.44


In [36]:
# Passive-aggressive classifier with a fixed seed and at most 50 iterations
model_pac = PassiveAggressiveClassifier(random_state=12, max_iter=50)

# Train on the TF-IDF features of the training tweets
model_pac.fit(train_idf, train.label)

# Predicted labels for both splits
predict_train, predict_test = (
    model_pac.predict(train_idf),
    model_pac.predict(test_idf),
)

# F1 score on the training split
f1_train = f1_score(y_pred=predict_train, y_true=train.label)
print("F1 score for train data {:.2f}".format(f1_train))

# F1 score on the test split
f1_test = f1_score(y_pred=predict_test, y_true=test.label)
print("F1 score for test data {:.2f}".format(f1_test))

F1 score for train data 0.55
F1 score for test data 0.49


In [37]:
# With Pipeline

# Chain vectorization and classification into one estimator so that raw tweet
# text can be passed straight to fit/predict.
# stop_words="english" selects scikit-learn's built-in English list (the same
# set that is exported as ENGLISH_STOP_WORDS). The string form is used because
# newer scikit-learn releases validate this parameter and accept only
# 'english', a list, or None -- passing the frozenset itself is rejected there.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(lowercase=True, max_features=1000, stop_words="english")),
    ('model', PassiveAggressiveClassifier(max_iter=50, random_state=12)),
])

# fit the pipeline model with the training data
pipeline.fit(train.tweet, train.label)


Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=1000,
                                 stop_words=frozenset({'a', 'about', 'above',
                                                       'across', 'after',
                                                       'afterwards', 'again',
                                                       'against', 'all',
                                                       'almost', 'alone',
                                                       'along', 'already',
                                                       'also', 'although',
                                                       'always', 'am', 'among',
                                                       'amongst', 'amoungst',
                                                       'amount', 'an', 'and',
                                                       'another', 'any',
                                                       'anyhow', 'anyone',
           

In [40]:
# sample tweet
# The text is split across two adjacent string literals, which Python joins at
# compile time. A backslash continuation inside the quotes would instead embed
# the next line's leading indentation into the tweet text.
text = [
    (
        "What does it take to film a spacewalk in 360 degree VR? We have the team behind the ISS Experience project that is "
        "launching their camera to the @Space_Station aboard a @northropgrumman rocket ready to answer your questions on @Reddit"
    )
]

# predict the label using the pipeline (one predicted label per input string)
pipeline.predict(text)
## >> array([0])

array([0], dtype=int64)

In [41]:
# joblib provides the serializer used to write the pipeline to disk
from joblib import dump

# Write the pipeline object to a joblib file in the working directory
dump(pipeline, "tweet_classification.joblib")

['tweet_classification.joblib']

In [42]:
# Show the rows whose label equals 1
data.loc[data["label"] == 1]

Unnamed: 0,id,label,tweet
13,14,1,@user #cnn calls #michigan middle school 'buil...
14,15,1,no comment! in #australia #opkillingbay #se...
17,18,1,retweet if you agree!
23,24,1,@user @user lumpy says i am a . prove it lumpy.
34,35,1,it's unbelievable that in the 21st century we'...
...,...,...,...
31934,31935,1,lady banned from kentucky mall. @user #jcpenn...
31946,31947,1,@user omfg i'm offended! i'm a mailbox and i'...
31947,31948,1,@user @user you don't have the balls to hashta...
31948,31949,1,"makes you ask yourself, who am i? then am i a..."
