**IMPORTING LIBRARIES**

In [None]:
import spacy as sp
from spacy.lang.en.stop_words import STOP_WORDS
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import pandas as pd

In [None]:
from nltk.tokenize import sent_tokenize

In [None]:
# Fetch the NLTK stop-word corpus, then build the English stop-word list as a set
# so later membership tests (`token in stop_words`) are constant-time.
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Download the 'punkt' tokenizer data used by NLTK's tokenizers.
# NOTE(review): recent NLTK releases look for 'punkt_tab' instead — confirm against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Download the WordNet corpus that WordNetLemmatizer reads its lemmas from.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

**DATASET INFORMATION**

In [None]:
# Load the raw training tweets from the working directory, decoding the file as Latin-1.
# NOTE(review): presumably Latin-1 is needed because the file is not valid UTF-8 — confirm.
df=pd.read_csv('train.csv', encoding='latin1')

In [None]:
# Column names, dtypes and non-null counts; `text` and `selected_text` each have one missing value.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   textID            27481 non-null  object 
 1   text              27480 non-null  object 
 2   selected_text     27480 non-null  object 
 3   sentiment         27481 non-null  object 
 4   Time of Tweet     27481 non-null  object 
 5   Age of User       27481 non-null  object 
 6   Country           27481 non-null  object 
 7   Population -2020  27481 non-null  int64  
 8   Land Area (Km²)   27481 non-null  float64
 9   Density (P/Km²)   27481 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 2.1+ MB


In [None]:
# Keep only the label (`sentiment`) and the raw tweet (`text`); the other columns are not used below.
df=df[['sentiment','text']]

In [None]:
# Drop rows with a missing value (the one tweet whose `text` is null); the index is not reset.
df=df.dropna()

In [None]:
# Sanity check: (rows, columns) after the column selection and null removal.
df.shape

(27480, 2)

**DATA PRE-PROCESSING** **using SPACY**

In [None]:
# Load the spaCy English pipeline in this cell. `nlp` was previously used without
# being created anywhere in the notebook, so the cell failed on a fresh kernel.
# NOTE(review): the model behind the saved outputs is not recorded; "en_core_web_sm"
# is assumed (install with `python -m spacy download en_core_web_sm`) — confirm.
nlp = sp.load("en_core_web_sm")

# Clean every tweet: lemmatise each token, drop punctuation and stop words, and store
# the space-joined lemmas in a new `processed_text` column.
# nlp.pipe streams the texts through the pipeline in batches, replacing the
# row-by-row df.iterrows() / df.at loop while producing one Doc per tweet in order.
df['processed_text'] = [
    " ".join(token.lemma_ for token in doc if not token.is_punct and not token.is_stop)
    for doc in nlp.pipe(df['text'])
]

In [None]:
# Inspect the spaCy-cleaned text column.
df['processed_text']

0                                           I`d respond go
1                                  Sooo SAD miss San Diego
2                                               boss bully
3                                          interview leave
4                                 son couldn`t release buy
                               ...                        
27476      wish come u Denver   husband lose job can`t ...
27477      I`ve wonder rake   client clear .NET don`t f...
27478      yay good enjoy break probably need hectic we...
27479                                              worth  
27480                      flirting go ATG smile yay   hug
Name: processed_text, Length: 27480, dtype: object

In [None]:
# Show raw text next to its spaCy-cleaned version for a visual comparison.
df

Unnamed: 0,sentiment,text,processed_text
0,neutral,"I`d have responded, if I were going",I`d respond go
1,negative,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD miss San Diego
2,negative,my boss is bullying me...,boss bully
3,negative,what interview! leave me alone,interview leave
4,negative,"Sons of ****, why couldn`t they put them on t...",son couldn`t release buy
...,...,...,...
27476,negative,wish we could come see u on Denver husband l...,wish come u Denver husband lose job can`t ...
27477,negative,I`ve wondered about rake to. The client has ...,I`ve wonder rake client clear .NET don`t f...
27478,positive,Yay good for both of you. Enjoy the break - y...,yay good enjoy break probably need hectic we...
27479,positive,But it was worth it ****.,worth




**DATA PREPROCESSING USING NLTK**

In [None]:
# Work on an independent copy so the NLTK-cleaned column is added without modifying `df`.
dfnew=df.copy()

In [None]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def _nltk_clean(raw_text):
    """Return the cleaned form of one tweet.

    The text is lower-cased and tokenised; tokens that are not purely
    alphabetic or that are English stop words are discarded; the remaining
    tokens are lemmatised with WordNet and joined with single spaces.
    """
    kept_tokens = (
        tok for tok in word_tokenize(raw_text.lower())
        if tok.isalpha() and tok not in stop_words
    )
    return " ".join(lemmatizer.lemmatize(tok) for tok in kept_tokens)


# Texts are read from `df` and written to `dfnew` (its copy) under the
# double-underscore column name `processed__text`; rows are matched by index label.
dfnew['processed__text'] = df['text'].map(_nltk_clean)


In [None]:
# Compare the spaCy output (`processed_text`) with the NLTK output (`processed__text`) side by side.
dfnew

Unnamed: 0,sentiment,text,processed_text,processed__text
0,neutral,"I`d have responded, if I were going",I`d respond go,responded going
1,negative,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD miss San Diego,sooo sad miss san diego
2,negative,my boss is bullying me...,boss bully,bos bullying
3,negative,what interview! leave me alone,interview leave,interview leave alone
4,negative,"Sons of ****, why couldn`t they put them on t...",son couldn`t release buy,son put release already bought
...,...,...,...,...
27476,negative,wish we could come see u on Denver husband l...,wish come u Denver husband lose job can`t ...,wish could come see u denver husband lost job ...
27477,negative,I`ve wondered about rake to. The client has ...,I`ve wonder rake client clear .NET don`t f...,wondered rake client made clear force devs lea...
27478,positive,Yay good for both of you. Enjoy the break - y...,yay good enjoy break probably need hectic we...,yay good enjoy break probably need hectic week...
27479,positive,But it was worth it ****.,worth,worth


**TFIDF AND NAIVE BAYES CLASSIFIER** using DATA CLEANED FROM SPACY

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score,recall_score,f1_score

# Features: spaCy-cleaned tweets; target: sentiment label.
X = df['processed_text']
y = df['sentiment']

# Hold out 15% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)

# Learn the TF-IDF vocabulary and weights on the training texts only, then
# project the held-out texts into that same feature space.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Multinomial Naive Bayes on the TF-IDF features.
classifier = MultinomialNB()
classifier.fit(X_train_tfidf, y_train)
y_pred = classifier.predict(X_test_tfidf)

# Accuracy plus class-frequency-weighted precision / recall / F1 on the held-out rows.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1score = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)
print('F1 Score', f1score)


Precision: 0.6575610572371137
Recall: 0.621300339640951
Accuracy: 0.621300339640951
F1 Score 0.6142935511324495


In [None]:
# Number of training texts (85% of the rows).
X_train.shape

(23358,)

In [None]:
# Number of held-out texts (15% of the rows).
X_test.shape

(4122,)

In [None]:
# Training labels; the length must match X_train.
y_train.shape

(23358,)

In [None]:
# Held-out labels; the length must match X_test.
y_test.shape

(4122,)

**TFIDF AND NAIVE BAYES CLASSIFIER** using DATA CLEANED FROM NLTK

In [None]:
# Features: NLTK-cleaned tweets; target: sentiment label.
X = dfnew['processed__text']
y = dfnew['sentiment']

# Same 85/15 split and seed as the spaCy experiment.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.15, random_state=42
)

# Fresh TF-IDF vocabulary fitted on the NLTK training texts only.
vectorizer = TfidfVectorizer()
Xtrain_tfidf = vectorizer.fit_transform(Xtrain)
Xtest_tfidf = vectorizer.transform(Xtest)

# Multinomial Naive Bayes on the TF-IDF features.
classifier = MultinomialNB()
classifier.fit(Xtrain_tfidf, ytrain)
ypred = classifier.predict(Xtest_tfidf)

# Accuracy plus class-frequency-weighted precision / recall / F1 on the held-out rows.
accuracyy = accuracy_score(ytest, ypred)
precisionn, recalll, f1scoree = (
    metric(ytest, ypred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

print("Precision:", precisionn)
print("Recall:", recalll)
print("Accuracy:", accuracyy)
print('F1 Score', f1scoree)


Precision: 0.6645151252177114
Recall: 0.6266375545851528
Accuracy: 0.6266375545851528
F1 Score 0.6196434364946701


**USING SUPPORT VECTOR MACHINES** on SPACY data




In [None]:
from sklearn import svm
from sklearn.metrics import classification_report

# Features: spaCy-cleaned tweets; target: sentiment label.
X=df['processed_text']
y=df['sentiment']

# Hold out 15% of the rows for evaluation; the seed keeps the split repeatable.
X__train, X__test, y__train, y__test = train_test_split(X,y, test_size=0.15, random_state=42 )

# Fit TF-IDF on this cell's training texts only and reuse it for the held-out texts.
vectorizerr = TfidfVectorizer()
X__train_tfidf = vectorizerr.fit_transform(X__train)
X__test_tfidf = vectorizerr.transform(X__test)

# Linear-kernel SVM. gamma and coef0 are not used by the linear kernel, so the
# explicit zero values that were passed before have been dropped.
classifier = svm.SVC(kernel='linear', random_state=0, C=1.0)

# Train and predict with the matrices and labels built in THIS cell. The cell
# previously used X_train_tfidf / y_train / X_test_tfidf left over from the
# Naive Bayes cell, which made it depend on hidden kernel state and ignored
# the vectorizer fitted just above.
classifier.fit(X__train_tfidf, y__train)
y__pred = classifier.predict(X__test_tfidf)

# Accuracy plus class-frequency-weighted precision / recall / F1 on the held-out rows.
acuracy = accuracy_score(y__test, y__pred)
precsion = precision_score(y__test, y__pred, average='weighted')
recal = recall_score(y__test, y__pred, average='weighted')
f1scor = f1_score(y__test, y__pred, average='weighted')

print("Precision:", precsion)
print("Recall:", recal)
print("Accuracy:", acuracy)
print('F1 Score', f1scor)

Precision: 0.7073166384443653
Recall: 0.7023289665211062
Accuracy: 0.7023289665211062
F1 Score 0.7018503757237503


**USING SUPPORT VECTOR MACHINES** on NLTK data

In [None]:
from sklearn import svm
from sklearn.metrics import classification_report

# Features: NLTK-cleaned tweets; target: sentiment label.
X=dfnew['processed__text']
y=dfnew['sentiment']

# Hold out 15% of the rows for evaluation; the seed keeps the split repeatable.
X__trainn, X__testt, y__trainn, y__testt = train_test_split(X,y, test_size=0.15, random_state=42 )

# Fit TF-IDF on this cell's training texts only and reuse it for the held-out texts.
vectorizerr = TfidfVectorizer()
X_train_tfidff = vectorizerr.fit_transform(X__trainn)
X_test_tfidff = vectorizerr.transform(X__testt)

# Linear-kernel SVM. gamma and coef0 are not used by the linear kernel, so the
# explicit zero values that were passed before have been dropped.
classifier = svm.SVC(kernel='linear', random_state=0, C=1.0)

# Fit against the labels produced by THIS cell's split. The cell previously
# passed y_train from the first Naive Bayes cell, which only lined up because
# both splits share the same index and seed.
classifier.fit(X_train_tfidff, y__trainn)
y__predd = classifier.predict(X_test_tfidff)

# Accuracy plus class-frequency-weighted precision / recall / F1 on the held-out rows.
acuracyy = accuracy_score(y__testt, y__predd)
precsionn = precision_score(y__testt, y__predd, average='weighted')
reccal = recall_score(y__testt, y__predd, average='weighted')
f1scr = f1_score(y__testt, y__predd, average='weighted')

print("Precision:", precsionn)
print("Recall:", reccal)
print("Accuracy:", acuracyy)
print('F1 Score', f1scr)

Precision: 0.7044358516814386
Recall: 0.6982047549733139
Accuracy: 0.6982047549733139
F1 Score 0.6975568637984898
