# Mounting content from Google Drive.

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Part 1: Loading and preprocessing the data

In [4]:
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import seaborn as sns
from sklearn.model_selection import train_test_split
import sys
import os
import re

tweet_path = "/content/gdrive/My Drive/SentimentTweets.csv"

#Creating the dataframe and converting every uppercase character to lowercase using the str.lower() function.
tweet_df = pd.read_csv(tweet_path).apply(lambda x: x.astype(str).str.lower())

#We will substitute every unwanted character with ' '. Here we remove the URLs.
tweet_df['text'] = tweet_df['text'].apply(lambda y: re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', " ", y , flags=re.MULTILINE) )

#Here we remove escape characters such as \n, \x and \u.
tweet_df['text'] = tweet_df['text'].apply(lambda z: re.sub(r'\\n', " ", z , flags=re.MULTILINE) )
tweet_df['text'] = tweet_df['text'].apply(lambda z: re.sub(r'\\x..', " ", z , flags=re.MULTILINE) )
tweet_df['text'] = tweet_df['text'].apply(lambda z: re.sub(r'\\u....', " ", z , flags=re.MULTILINE) )

#And finally we remove any other remaining symbols by removing every non-alphabetic character.
tweet_df['text'] = tweet_df['text'].apply(lambda k: re.sub("[^a-z]+", " ", k, flags=re.MULTILINE) )

#Splitting the dataframe into train and test subsets.
train_set, test_set = train_test_split(tweet_df, test_size=0.2, random_state=42)

# Part 2: Stemming, lemmatization and initializing the TF-IDF array

In [13]:
#Showing the 'text' column of the training set after the preprocessing.
train_set['text']

263132      lalavazquez happy mother s day have a wonderf...
615448     i m really going to miss jonasbrothers th june...
158240     wonders kenapa ya terngiang lagu you never giv...
1218246     beyondbirthday no problem can you do tha same...
414653      billyscallywag you re welcome love the spider...
                                 ...                        
110268      zoeox iii can t breathe easy legends y ooh th...
259178                      grienke with another loss today 
131932     so full of energy today spent the morning sing...
671155      is about to watch the making of radioactive dvd 
121958     demolished some peanut butter honey toast and ...
Name: text, Length: 1024000, dtype: object

# Stemming

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import warnings
warnings.filterwarnings('ignore') 

import nltk


#Objects needed for the stemming.
from nltk.stem import PorterStemmer, WordNetLemmatizer
porter_stemmer = PorterStemmer()

#Defining a stem_sentences function that will stem our text and return it in string format.
def stem_sentences(sentence):
    tokens = sentence.split()
    stemmed_tokens = [porter_stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

#Stemming the text.
train_set['stemmed_text'] = train_set['text'].apply(stem_sentences)
test_set['stemmed_text'] = test_set['text'].apply(stem_sentences)

#Showing the 'text' column of the training set after stemming.
train_set['stemmed_text']

263132     lalavazquez happi mother s day have a wonder a...
615448     i m realli go to miss jonasbroth th june i wan...
158240     wonder kenapa ya terngiang lagu you never give...
1218246     beyondbirthday no problem can you do tha same me
414653     billyscallywag you re welcom love the spiderma...
                                 ...                        
110268     zoeox iii can t breath easi legend y ooh thank...
259178                          grienk with anoth loss today
131932     so full of energi today spent the morn sing al...
671155            is about to watch the make of radioact dvd
121958     demolish some peanut butter honey toast and is...
Name: stemmed_text, Length: 1024000, dtype: object

# Lemmatization

In [9]:

#Objects needed for the lemmatization.
tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

#Defining a lemmatize function that will lemmatize our text and return it in string format.
def lemmatize(text):
    string_list = [lemmatizer.lemmatize(word) for word in tokenizer.tokenize(text)]
    list_to_str = ' '.join([str(element) for element in string_list])
    return list_to_str

#Performing lemmatization on the stemmed text.
train_set['lemmatized_text'] = train_set['stemmed_text'].apply(lemmatize)
test_set['lemmatized_text'] = test_set['stemmed_text'].apply(lemmatize)

#Showing the lemmatized text of the training set.
train_set['lemmatized_text']

263132     lalavazquez happi mother s day have a wonder a...
615448     i m realli go to miss jonasbroth th june i wan...
158240     wonder kenapa ya terngiang lagu you never give...
1218246     beyondbirthday no problem can you do tha same me
414653     billyscallywag you re welcom love the spiderma...
                                 ...                        
110268     zoeox iii can t breath easi legend y ooh thank...
259178                          grienk with anoth loss today
131932     so full of energi today spent the morn sing al...
671155            is about to watch the make of radioact dvd
121958     demolish some peanut butter honey toast and is...
Name: lemmatized_text, Length: 1024000, dtype: object

# Initializing the TF-IDF array

In [10]:
#Initializing TF-IDF array with unigrams and bigrams.
tfidf = TfidfVectorizer(ngram_range=(1,2))
tfs_train = tfidf.fit_transform(train_set['lemmatized_text'])
tfs_test = tfidf.transform(test_set['lemmatized_text'])

#Getting the Y's.
Y_train = train_set['target']
Y_test = test_set['target']

# Part 3: Classification and scores

In [11]:
from sklearn.metrics import classification_report

#Performing the classification. max_iter argument was increased because the default one gave congression errors.
reg_model = LogisticRegression(max_iter=500)
reg_model.fit(tfs_train, Y_train)
y_test_pred = reg_model.predict(tfs_test)

#The classification report will give us the scores for precision, recall and F1 metrics.
print(classification_report(Y_test,y_test_pred))

              precision    recall  f1-score   support

           0       0.82      0.82      0.82    128091
           4       0.82      0.82      0.82    127909

    accuracy                           0.82    256000
   macro avg       0.82      0.82      0.82    256000
weighted avg       0.82      0.82      0.82    256000



# Part 4: Cross Validation

In [12]:
from sklearn.model_selection import cross_val_score

#Finally we also perform a cross validation on the data.
cross_val_score(reg_model,tfs_train, Y_train, cv = 10, n_jobs=4)

array([0.81893555, 0.81916992, 0.81870117, 0.81824219, 0.81874023,
       0.81855469, 0.81731445, 0.81958008, 0.81974609, 0.81847656])