In [3]:
import pandas as pd #used to perform data manipulation and analysis
import numpy as np #used to perform a wide variety of mathematical operations on arrays
import string #used to obtain information in the string and manipulate the string overall
import re #used as a regular expression to find particular patterns and process it
import nltk #a natural language processing toolkit module associated in anaconda
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import warnings #to manipulate warnings details
nltk.download('stopwords')

warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\manis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
df = pd.read_csv('TweetSentimentData.csv', encoding='latin-1')
df.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [5]:
df.columns = ["target","id","date","flag","user","text"]
df

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
...,...,...,...,...,...,...
1599994,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599995,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599996,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599997,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599999 entries, 0 to 1599998
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1599999 non-null  int64 
 1   id      1599999 non-null  int64 
 2   date    1599999 non-null  object
 3   flag    1599999 non-null  object
 4   user    1599999 non-null  object
 5   text    1599999 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [7]:
df.sort_values("id", axis = 0,inplace = True, na_position ='last')
df

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
...,...,...,...,...,...,...
799994,0,2329205009,Thu Jun 25 10:28:28 PDT 2009,NO_QUERY,dandykim,Sick Spending my day laying in bed listening ...
799995,0,2329205038,Thu Jun 25 10:28:28 PDT 2009,NO_QUERY,bigenya,Gmail is down?
799996,0,2329205473,Thu Jun 25 10:28:30 PDT 2009,NO_QUERY,LeeLHoke,rest in peace Farrah! So sad
799997,0,2329205574,Thu Jun 25 10:28:30 PDT 2009,NO_QUERY,davidlmulder,@Eric_Urbane Sounds like a rival is flagging y...


In [8]:
# removes pattern in the input text
#This function works to remove certain patterns in the text for preprocessing
def remove_pattern(input_txt, pattern):  
    r = re.findall(pattern, input_txt)
    for word in r:
        input_txt = re.sub(word, "", input_txt)
    return input_txt

In [9]:
# remove twitter handles (@user)
#"@[\w]*" is the twitter handle pattern to remove in the text for preprocessing
df['clean_tweet'] = np.vectorize(remove_pattern)(df['text'], "@[\w]*")


In [10]:
df.head()

Unnamed: 0,target,id,date,flag,user,text,clean_tweet
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,I dived many times for the ball. Managed to s...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am..."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew,not the whole crew


In [11]:
# remove special characters, numbers and punctuations
df['clean_tweet'] = df['clean_tweet'].str.replace("[^a-zA-Z#]", " ")
df.head()

Unnamed: 0,target,id,date,flag,user,text,clean_tweet
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,I dived many times for the ball. Managed to s...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am..."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew,not the whole crew


In [12]:
# replacing emoji
df['clean_tweet'] = df['clean_tweet'].str.replace(r'[^\w\s]', " ")
df.head()

Unnamed: 0,target,id,date,flag,user,text,clean_tweet
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,I dived many times for the ball. Managed to s...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am..."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew,not the whole crew


In [13]:
stop_words = nltk.corpus.stopwords.words('english')
def remove_stop_words(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

In [14]:
df['clean_tweet'] = df['clean_tweet'].apply(remove_stop_words)
df.head()

Unnamed: 0,target,id,date,flag,user,text,clean_tweet
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset can't update Facebook texting it... migh...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,I dived many times ball. Managed save 50% The ...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole body feels itchy like fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","no, behaving all. i'm mad. here? I can't see t..."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew,whole crew


In [15]:
# individual words considered as tokens
tokenized_tweet = df['clean_tweet'].apply(lambda x: x.split())
tokenized_tweet.head()


0    [upset, can't, update, Facebook, texting, it.....
1    [I, dived, many, times, ball., Managed, save, ...
2              [whole, body, feels, itchy, like, fire]
3    [no,, behaving, all., i'm, mad., here?, I, can...
4                                        [whole, crew]
Name: clean_tweet, dtype: object

In [16]:
#Stemmer.stem() converts certain words into its simpler version.
stemmer = PorterStemmer()
tokenized_tweet = tokenized_tweet.apply(lambda sentence: [stemmer.stem(word) for word in sentence])
tokenized_tweet.head()

0    [upset, can't, updat, facebook, text, it..., m...
1    [i, dive, mani, time, ball., manag, save, 50%,...
2               [whole, bodi, feel, itchi, like, fire]
3    [no,, behav, all., i'm, mad., here?, i, can't,...
4                                        [whole, crew]
Name: clean_tweet, dtype: object

In [17]:
#Combining the tokenized words into a sentence
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = " ".join(tokenized_tweet[i])
    
df['clean_tweet'] = tokenized_tweet
df.head()


Unnamed: 0,target,id,date,flag,user,text,clean_tweet
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset can't updat facebook text it... might cr...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,i dive mani time ball. manag save 50% the rest...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","no, behav all. i'm mad. here? i can't see there."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew,whole crew


In [18]:
#Extraction of the data into vectors for training and testing
# feature extraction
from sklearn.feature_extraction.text import CountVectorizer
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000)
bow = bow_vectorizer.fit_transform(df['clean_tweet'])

In [19]:
X = df['clean_tweet'].values
y = df['target'].values

In [20]:
#Convert a collection of raw documents to a matrix of TF-IDF features.
from sklearn.feature_extraction.text import TfidfVectorizer
vector = TfidfVectorizer(sublinear_tf=True,ngram_range=(1,2), stop_words = stop_words, strip_accents = 'unicode', max_features = 500000)
X = vector.fit_transform(X)
print(f'Vector fitted.')

Vector fitted.


In [21]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=101, test_size=0.20)

In [22]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold


In [23]:
def model_train(model, X_train, X_test, y_train, y_test):
    model.fit(X_train,y_train)
    y_pred_tr = model.predict(X_train)
    y_pred = model.predict(X_test)

    print("--------------------Training Performance---------------------")
    print(classification_report(y_train,y_pred_tr))
    print("-------------------------------------------------------------")
    print("--------------------Testing Performance----------------------")
    print(classification_report(y_test,y_pred))

In [24]:
model = MultinomialNB()
model_train(model, x_train, x_test, y_train, y_test)

--------------------Training Performance---------------------
              precision    recall  f1-score   support

           0       0.83      0.82      0.82    639717
           4       0.82      0.83      0.83    640282

    accuracy                           0.83   1279999
   macro avg       0.83      0.83      0.83   1279999
weighted avg       0.83      0.83      0.83   1279999

-------------------------------------------------------------
--------------------Testing Performance----------------------
              precision    recall  f1-score   support

           0       0.78      0.77      0.77    160282
           4       0.77      0.78      0.77    159718

    accuracy                           0.77    320000
   macro avg       0.77      0.77      0.77    320000
weighted avg       0.77      0.77      0.77    320000



In [25]:
model = LogisticRegression()
model_train(model, x_train, x_test, y_train, y_test)

--------------------Training Performance---------------------
              precision    recall  f1-score   support

           0       0.83      0.80      0.81    639717
           4       0.81      0.83      0.82    640282

    accuracy                           0.82   1279999
   macro avg       0.82      0.82      0.82   1279999
weighted avg       0.82      0.82      0.82   1279999

-------------------------------------------------------------
--------------------Testing Performance----------------------
              precision    recall  f1-score   support

           0       0.80      0.77      0.79    160282
           4       0.78      0.81      0.79    159718

    accuracy                           0.79    320000
   macro avg       0.79      0.79      0.79    320000
weighted avg       0.79      0.79      0.79    320000



In [26]:
model = RandomForestClassifier(n_estimators=100,max_depth=15,max_features='sqrt')
model_train(model, x_train, x_test, y_train, y_test)

--------------------Training Performance---------------------
              precision    recall  f1-score   support

           0       0.74      0.68      0.71    639717
           4       0.70      0.76      0.73    640282

    accuracy                           0.72   1279999
   macro avg       0.72      0.72      0.72   1279999
weighted avg       0.72      0.72      0.72   1279999

-------------------------------------------------------------
--------------------Testing Performance----------------------
              precision    recall  f1-score   support

           0       0.73      0.68      0.70    160282
           4       0.70      0.75      0.72    159718

    accuracy                           0.71    320000
   macro avg       0.72      0.71      0.71    320000
weighted avg       0.72      0.71      0.71    320000



In [27]:
LR = LogisticRegression()
LR.fit(x_train,y_train)

In [28]:
from sklearn.metrics import accuracy_score


pred_train = LR.predict(x_train)
accuracy_score(y_train,pred_train)

0.8180084515691027

In [29]:
pred = LR.predict(x_test)
accuracy_score(y_test,pred)

0.789171875

In [30]:
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'lbfgs', 'newton-cg', 'sag', 'saga']
}


In [31]:
# Use Stratified K-Fold for balanced evaluation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [32]:
# Perform grid search
random_search = RandomizedSearchCV(LogisticRegression(), param_grid, cv=kfold, scoring='f1_macro', n_iter=100)


In [33]:
'''from sklearn.model_selection import cross_val_score
for solver in ['lbfgs', 'sag', 'saga']:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        for penalty in ['l2']:
            model = LogisticRegression(C=C, penalty=penalty, solver=solver)
            scores = cross_val_score(model, x_train, y_train, cv=kfold, scoring='f1_macro')
            mean_score = scores.mean()
            print(f"C={C}, solver={solver}, penalty={penalty}, F1-macro={mean_score:.4f}")
    print()'''


C=0.001, solver=lbfgs, penalty=l2, F1-macro=0.7298
C=0.01, solver=lbfgs, penalty=l2, F1-macro=0.7513
C=0.1, solver=lbfgs, penalty=l2, F1-macro=0.7754
C=1, solver=lbfgs, penalty=l2, F1-macro=0.7877
C=10, solver=lbfgs, penalty=l2, F1-macro=0.7827
C=100, solver=lbfgs, penalty=l2, F1-macro=0.7820

C=0.001, solver=sag, penalty=l2, F1-macro=0.7301
C=0.01, solver=sag, penalty=l2, F1-macro=0.7521
C=0.1, solver=sag, penalty=l2, F1-macro=0.7759
C=1, solver=sag, penalty=l2, F1-macro=0.7890
C=10, solver=sag, penalty=l2, F1-macro=0.7741
C=100, solver=sag, penalty=l2, F1-macro=0.7390

C=0.001, solver=saga, penalty=l2, F1-macro=0.7301
C=0.01, solver=saga, penalty=l2, F1-macro=0.7521
C=0.1, solver=saga, penalty=l2, F1-macro=0.7759
C=1, solver=saga, penalty=l2, F1-macro=0.7890
C=10, solver=saga, penalty=l2, F1-macro=0.7741
C=100, solver=saga, penalty=l2, F1-macro=0.7416



In [34]:
'''from sklearn.model_selection import cross_val_score
for solver in ['lbfgs', 'sag', 'saga']:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        for penalty in ['l2']:
            model = LogisticRegression(C=C, penalty=penalty, solver=solver)
            scores = cross_val_score(model, x_test, y_test, cv=kfold, scoring='f1_macro')
            mean_score = scores.mean()
            print(f"C={C}, solver={solver}, penalty={penalty}, F1-macro={mean_score:.4f}")
    print()'''



C=0.001, solver=lbfgs, penalty=l2, F1-macro=0.7116
C=0.01, solver=lbfgs, penalty=l2, F1-macro=0.7368
C=0.1, solver=lbfgs, penalty=l2, F1-macro=0.7607
C=1, solver=lbfgs, penalty=l2, F1-macro=0.7761
C=10, solver=lbfgs, penalty=l2, F1-macro=0.7639
C=100, solver=lbfgs, penalty=l2, F1-macro=0.7417

C=0.001, solver=sag, penalty=l2, F1-macro=0.7105
C=0.01, solver=sag, penalty=l2, F1-macro=0.7372
C=0.1, solver=sag, penalty=l2, F1-macro=0.7610
C=1, solver=sag, penalty=l2, F1-macro=0.7760
C=10, solver=sag, penalty=l2, F1-macro=0.7635
C=100, solver=sag, penalty=l2, F1-macro=0.7312

C=0.001, solver=saga, penalty=l2, F1-macro=0.7112
C=0.01, solver=saga, penalty=l2, F1-macro=0.7372
C=0.1, solver=saga, penalty=l2, F1-macro=0.7610
C=1, solver=saga, penalty=l2, F1-macro=0.7760
C=10, solver=saga, penalty=l2, F1-macro=0.7635
C=100, solver=saga, penalty=l2, F1-macro=0.7336

