In [164]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import string
import nltk
import nltk.corpus
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [165]:
train = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# Data Understanding

In [166]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [167]:
test_data.head()

Unnamed: 0,id,tweet
0,7921,I hate the new #iphone upgrade. Won't let me d...
1,7922,currently shitting my fucking pants. #apple #i...
2,7923,"I'd like to puts some CD-ROMS on my iPad, is t..."
3,7924,My ipod is officially dead. I lost all my pict...
4,7925,Been fighting iTunes all night! I only want th...


In [168]:
train.shape

(7920, 3)

In [169]:
test_data.shape

(1953, 2)

In [170]:
train.sample(5)

Unnamed: 0,id,label,tweet
466,467,0,Sunset One Minute Ago #zeeland #dezeekust #bui...
6149,6150,0,Mornings’ duckies x. #sunday #morningwalk #iph...
3671,3672,0,My #urkel Technically he was dressed to be an ...
1857,1858,0,RT https://twitter.com/AbensonPH/status/700923...
7813,7814,0,#iPhone app #theme #preview. We’d your #inputs...


In [171]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7920 entries, 0 to 7919
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      7920 non-null   int64 
 1   label   7920 non-null   int64 
 2   tweet   7920 non-null   object
dtypes: int64(2), object(1)
memory usage: 185.8+ KB


In [172]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1953 entries, 0 to 1952
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      1953 non-null   int64 
 1   tweet   1953 non-null   object
dtypes: int64(1), object(1)
memory usage: 30.6+ KB


In [173]:
train.duplicated().sum()

0

In [174]:
test_data.duplicated().sum()

0

# Data Cleaning / Preprocessing

In [175]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [176]:
train = train.drop(columns=["id"])
test = test_data.drop(columns=["id"])

In [177]:
train["tweet"][0]

'#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone'

In [178]:
train["tweet"][1]

'Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/'

In [179]:
train["label"].value_counts()

0    5894
1    2026
Name: label, dtype: int64

In [180]:
#REMOVING LINKS, TOKENISATION AND REMOVING PUNCTUATIONS

def preprocessing1(text):
    te = re.sub(r'http\S+', "", text)
    tokens = word_tokenize(te)
    
    punctuations = string.punctuation
    txt = []
    for i in tokens:
        if i not in punctuations:
            txt.append(i)
    return txt

In [181]:
train["tweet"] = train["tweet"].apply(preprocessing1)

In [182]:
train["tweet"][0]

['fingerprint',
 'Pregnancy',
 'Test',
 'android',
 'apps',
 'beautiful',
 'cute',
 'health',
 'igers',
 'iphoneonly',
 'iphonesia',
 'iphone']

In [183]:
#REMOVING STOPWORDS & CONVERTING TO LOWERCASE

def preprocessing2(text):
    stopword = stopwords.words("english")
    ntxt = []
    for j in text:
        if j not in stopword:
            ntxt.append(j.lower())
    return ntxt

In [184]:
train["tweet"] = train["tweet"].apply(preprocessing2)

In [185]:
train["tweet"][0]

['fingerprint',
 'pregnancy',
 'test',
 'android',
 'apps',
 'beautiful',
 'cute',
 'health',
 'igers',
 'iphoneonly',
 'iphonesia',
 'iphone']

In [186]:
#STEMMING

def preprocessing3(text):
    stem = []
    Stemmer = PorterStemmer()
    for i in text:
        stem.append(Stemmer.stem(i))
    return " ".join(stem)

In [187]:
train["tweet"] = train["tweet"].apply(preprocessing3)

In [188]:
train["tweet"][0]

'fingerprint pregnanc test android app beauti cute health iger iphoneonli iphonesia iphon'

In [189]:
test["tweet"] = test["tweet"].apply(preprocessing1)
test["tweet"] = test["tweet"].apply(preprocessing2)
test["tweet"] = test["tweet"].apply(preprocessing3)

In [190]:
test["tweet"][0]

"i hate new iphon upgrad wo n't let download app ugh appl suck"

In [191]:
train.head(10)

Unnamed: 0,label,tweet
0,0,fingerprint pregnanc test android app beauti c...
1,0,final transpar silicon case ^^ thank uncl yay ...
2,0,we love would go talk makememori unplug relax ...
3,0,i 'm wire i know i 'm georg i made way iphon c...
4,1,what amaz servic appl wo n't even talk questio...
5,1,iphon softwar updat fuck phone big time stupid...
6,0,happi us .. instap instadaili us soni xperia x...
7,0,new type c charger cabl uk … bay amazon etsi n...
8,0,bout go shop listen music iphon justm music li...
9,0,photo fun selfi pool water soni camera picofth...


In [192]:
test.head(10)

Unnamed: 0,tweet
0,i hate new iphon upgrad wo n't let download ap...
1,current shit fuck pant appl imac cashmoney rad...
2,i 'd like put cd-rom ipad possibl — ye would n...
3,my ipod offici dead i lost pictur video 1d 5so...
4,been fight itun night i want music i paid
5,repost getbakednfri repostapp ・・・ announc appl...
6,thi new appl softwar updat realli thing phone ...
7,babi iphon iphone6 gold new appl appleisbest 6...
8,i 'm confus ... i take time set appoint to sti...
9,fruit tast better pick appl healthi fruit nyc ...


In [193]:
train.isna().sum()

label    0
tweet    0
dtype: int64

In [194]:
test.isna().sum()

tweet    0
dtype: int64

In [195]:
train.shape

(7920, 2)

In [196]:
test.shape

(1953, 1)

# MODEL BUILDING

In [198]:
x = train["tweet"]
y = train["label"]

In [199]:
tfidf = TfidfVectorizer()
x = tfidf.fit_transform(x)

In [200]:
test_transformed = tfidf.transform(test["tweet"])

In [201]:
test_transformed.shape

(1953, 16324)

In [202]:
x.shape

(7920, 16324)

In [67]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=None)

**Logistic Regression**

In [68]:
logr = LogisticRegression()
model = logr.fit(x_train,y_train)
pred = model.predict(x_test)
accuracy_score(y_test, pred)*100

85.54292929292929

In [73]:
score = []
for i in range(1000):
    x = train["tweet"]
    y = train["label"]
    x = tfidf.fit_transform(x)
    
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=i)
    
    logr = LogisticRegression()
    logr.fit(x_train, y_train)
    pred = logr.predict(x_test)
    score.append(accuracy_score(y_test,pred)*100)
print("i= ",np.argmax(score))
print(score[np.argmax(score)])

i=  167
89.14141414141415


In [74]:
logr = LogisticRegression()
x = train["tweet"]
y = train["label"]
x = tfidf.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=np.argmax(score))

model1 = logr.fit(x_train,y_train)
pred = model1.predict(x_test)
accuracy_score(y_test, pred)*100

89.14141414141415

In [75]:
# Training Data Accuracy

pred = model1.predict(x_train)
accuracy_score(y_train, pred)*100     #Generalized

91.82449494949495

In [76]:
#CROSS VALIDATION TO CHECK WHETHER THE MODEL IS GENERALIZED OR NOT

mod1 = cross_val_score(LogisticRegression(), x_train, y_train, cv=10)
print(mod1)
print(np.round(np.mean(mod1),2)*100)

[0.86277603 0.87066246 0.87223975 0.87539432 0.8659306  0.85804416
 0.86729858 0.87993681 0.84834123 0.8657188 ]
87.0


**Decision Tree Classifier**

In [77]:
dtc = DecisionTreeClassifier(random_state=4)
model = dtc.fit(x_train,y_train)
pred = model.predict(x_test)
accuracy_score(y_test, pred)*100

82.82828282828282

In [78]:
score = []
for i in range(100):
    x = train["tweet"]
    y = train["label"]
    x = tfidf.fit_transform(x)
    
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=i)
    
    dtc = DecisionTreeClassifier(random_state=4)
    dtc.fit(x_train, y_train)
    pred = dtc.predict(x_test)
    score.append(accuracy_score(y_test,pred)*100)
print("i= ",np.argmax(score))
print(score[np.argmax(score)])

i=  13
85.35353535353535


In [81]:
dtc = DecisionTreeClassifier(random_state=4)
x = train["tweet"]
y = train["label"]
x = tfidf.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=np.argmax(score))

model2 = dtc.fit(x_train,y_train)
pred = model2.predict(x_test)
accuracy_score(y_test, pred)*100

85.35353535353535

In [82]:
# Training Data Accuracy

pred = model2.predict(x_train)
accuracy_score(y_train, pred)*100     #Overfitting

99.95265151515152

In [83]:
#CROSS VALIDATION TO CHECK WHETHER THE MODEL IS GENERALIZED OR NOT

mod2 = cross_val_score(DecisionTreeClassifier(random_state=4), x_train, y_train, cv=5)
print(mod2)
print(np.round(np.mean(mod2),2)*100)

[0.83280757 0.83741121 0.83267561 0.82478295 0.81689029]
83.0


**RandomForest Classifier**

In [86]:
rfc = RandomForestClassifier(random_state=4)
model = rfc.fit(x_train,y_train)
pred = model.predict(x_test)
accuracy_score(y_test, pred)*100

87.81565656565657

In [87]:
score = []
for i in range(100):
    x = train["tweet"]
    y = train["label"]
    x = tfidf.fit_transform(x)
    
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=i)
    
    rfc = RandomForestClassifier(random_state=4)
    rfc.fit(x_train, y_train)
    pred = rfc.predict(x_test)
    score.append(accuracy_score(y_test,pred)*100)
print("i= ",np.argmax(score))
print(score[np.argmax(score)])

i=  71
88.25757575757575


In [88]:
rfc = RandomForestClassifier(random_state=4)
x = train["tweet"]
y = train["label"]
x = tfidf.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=np.argmax(score))

model3 = rfc.fit(x_train,y_train)
pred = model3.predict(x_test)
accuracy_score(y_test, pred)*100

88.25757575757575

In [89]:
# Training Data Accuracy

pred = model3.predict(x_train)
accuracy_score(y_train, pred)*100     #Overfitting

99.98421717171718

In [90]:
#CROSS VALIDATION TO CHECK WHETHER THE MODEL IS GENERALIZED OR NOT

mod3 = cross_val_score(RandomForestClassifier(random_state=4), x_train, y_train, cv=10)
print(mod3)
print(np.round(np.mean(mod3),2)*100)

[0.85646688 0.88643533 0.85962145 0.86750789 0.85962145 0.8785489
 0.85624013 0.84834123 0.85781991 0.88467615]
87.0


**Support Vector Classifier**

In [94]:
svc = SVC(kernel="linear",tol=0.1)
model = svc.fit(x_train,y_train)
pred = model.predict(x_test)
accuracy_score(y_test, pred)*100

89.77272727272727

In [95]:
score = []
for i in range(50):
    x = train["tweet"]
    y = train["label"]
    x = tfidf.fit_transform(x)    
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=i)
    
    svc = SVC(kernel="linear",tol=0.1)
    svc.fit(x_train, y_train)
    pred = svc.predict(x_test)
    score.append(accuracy_score(y_test,pred)*100)
print("i= ",np.argmax(score))
print(score[np.argmax(score)])

i=  49
90.78282828282829


In [96]:
svc = SVC(kernel="linear",tol=0.1)
x = train["tweet"]
y = train["label"]
x = tfidf.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=np.argmax(score))

model4 = svc.fit(x_train,y_train)
pred = model4.predict(x_test)
accuracy_score(y_test, pred)*100

90.78282828282829

In [97]:
# Training Data Accuracy

pred = model4.predict(x_train)
accuracy_score(y_train, pred)*100     #Generalized

96.32260101010101

In [98]:
#CROSS VALIDATION TO CHECK WHETHER THE MODEL IS GENERALIZED OR NOT

mod4 = cross_val_score(SVC(kernel="linear",tol=0.1), x_train, y_train, cv=4)
print(mod4)
print(np.round(np.mean(mod4),2)*100)

[0.88194444 0.87058081 0.89646465 0.87563131]
88.0


In [207]:
test_pred = model1.predict(test_transformed)
test_pred

array([1, 1, 1, ..., 1, 1, 0], dtype=int64)

In [208]:
submission = pd.DataFrame({"id": test_data["id"], "label": test_pred})

In [209]:
submission.head()

Unnamed: 0,id,label
0,7921,1
1,7922,1
2,7923,1
3,7924,0
4,7925,0


In [210]:
submission.to_csv('Sentiment_Identification_submission.csv', index=False)