In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

In [2]:
data = pd.read_csv('.\data\\twitter_data30k.csv')

In [3]:
data.head()

Unnamed: 0,twitts,sentiment
0,@robbiebronniman Sounds like a great night.,1
1,Damn the person who stolde my wallet !!!!! Ma...,1
2,Greetings from the piano bench (photo) http:/...,1
3,@drewryanscott i love it!! i love you!! haha f...,1
4,"@kissthestars Pretty pretty pretty please, pak...",0


In [4]:
data['sentiment'].value_counts()

1    15000
0    15000
Name: sentiment, dtype: int64

In [25]:
def svm_pred(data):
    X = data['twitts']
    y = data['sentiment']

    tfidf = TfidfVectorizer(norm='l1')
    X=tfidf.fit_transform(X)

    X_train,X_test, y_train,y_test = train_test_split(X,y, test_size=0.2, random_state=0, stratify=y)

    print('Shape of X: ',X.shape )

    clf = LinearSVC()
    clf.fit(X_train,y_train)

    y_pred = clf.predict(X_test)

    print()
    print('Printing Report')
    print(classification_report(y_test, y_pred))
    
    return tfidf, clf


In [26]:
%%time
tfidf,clf = svm_pred(data)

Shape of X:  (30000, 43007)

Printing Report
              precision    recall  f1-score   support

           0       0.76      0.76      0.76      3000
           1       0.76      0.76      0.76      3000

    accuracy                           0.76      6000
   macro avg       0.76      0.76      0.76      6000
weighted avg       0.76      0.76      0.76      6000

Wall time: 1.03 s


In [27]:
mysent = ['Eating fats is bad']

mypred = clf.predict(tfidf.transform(mysent))
mypred
# 1 means positive sentiment

array([0], dtype=int64)

In [28]:
print(np.where(mypred==1,'Its a positive sentiment','Its a negative sentiment'))

['Its a negative sentiment']


In [29]:
import textpreprocess_2022 as pp

In [30]:
pp.__version__

'0.0.1'

In [31]:
data['twitts'] =data['twitts'].apply(lambda x: x.lower())

In [32]:
data['twitts']

0                robbiebronniman sounds like a great night
1        damn the person who stolde my wallet may karma...
2                     greetings from the piano bench photo
3        drewryanscott i love it i love you haha forget...
4        kissthestars pretty pretty pretty please pakid...
                               ...                        
29995      calumfan1 is it in any way related to photoshop
29996                        swiz_nz really wow thats crap
29997    at the 2010 lexus hs250h press event again can...
29998    karmicunderpath ooooh now there is a nice thought
29999    mariap91 id usually ask you about the sun and ...
Name: twitts, Length: 30000, dtype: object

In [33]:
data['twitts']=data['twitts'].apply(lambda x: pp.cont_ext(x))

In [34]:
svm_pred(data)

Shape of X:  (30000, 43007)

Printing Report
              precision    recall  f1-score   support

           0       0.76      0.76      0.76      3000
           1       0.76      0.76      0.76      3000

    accuracy                           0.76      6000
   macro avg       0.76      0.76      0.76      6000
weighted avg       0.76      0.76      0.76      6000



(TfidfVectorizer(norm='l1'), LinearSVC())

In [35]:
data['twitts']=data['twitts'].apply(lambda x: pp.remove_emails(x))
data['twitts']=data['twitts'].apply(lambda x: pp.remove_urls(x))
data['twitts']=data['twitts'].apply(lambda x: pp.remove_rt(x))
data['twitts']=data['twitts'].apply(lambda x: pp.remove_html_tags(x))
data['twitts']=data['twitts'].apply(lambda x: pp.remove_special_chars(x))


In [36]:
#pp.remove_emails('Hello nitin contact@gmail.com, is my mail id. New mail id is contactme@acb.com')

In [37]:
tfidf, clf = svm_pred(data)

Shape of X:  (30000, 43007)

Printing Report
              precision    recall  f1-score   support

           0       0.76      0.76      0.76      3000
           1       0.76      0.76      0.76      3000

    accuracy                           0.76      6000
   macro avg       0.76      0.76      0.76      6000
weighted avg       0.76      0.76      0.76      6000



## Saving & Loading the Model

In [38]:
import pickle

In [39]:
# wb - write binary mode
pickle.dump(clf, open('clf.pkl','wb'))
pickle.dump(tfidf,open('tfidf.pkl','wb'))

In [40]:
tfidf

TfidfVectorizer(norm='l1')

In [41]:
clf

LinearSVC()

In [42]:
del tfidf

In [43]:
del clf

In [47]:
# rb - read binary mode
clf = pickle.load(open('clf.pkl','rb'))
tfidf = pickle.load(open('tfidf.pkl','rb'))

In [48]:
clf

LinearSVC()

In [49]:
tfidf

TfidfVectorizer(norm='l1')

In [50]:
# How many times below words are occcured in our dataframe or data
tfidf.vocabulary_

{'robbiebronniman': 31627,
 'sounds': 34837,
 'like': 21885,
 'great': 15496,
 'night': 26468,
 'damn': 9485,
 'the': 37224,
 'person': 28602,
 'who': 41168,
 'stolde': 35583,
 'my': 25714,
 'wallet': 40502,
 'may': 23738,
 'karma': 20160,
 'come': 8217,
 'back': 3763,
 'and': 2511,
 'bite': 4889,
 'you': 42492,
 'in': 17928,
 'ass': 3274,
 'greetings': 15547,
 'from': 14248,
 'piano': 28818,
 'bench': 4526,
 'photo': 28775,
 'drewryanscott': 11310,
 'love': 22562,
 'it': 18503,
 'haha': 15891,
 'forget': 13921,
 'hugyou': 17339,
 'should': 33661,
 'give': 14998,
 'me': 23859,
 'kissno': 20736,
 'lie': 21817,
 'please': 29115,
 'would': 41743,
 'be': 4210,
 'awesome': 3577,
 'if': 17691,
 'did': 10484,
 'kissthestars': 20737,
 'pretty': 29651,
 'pakidownload': 28017,
 'ito': 18553,
 'then': 37396,
 'reupload': 31335,
 'someother': 34649,
 'site': 34005,
 'mediafire': 23903,
 'hindi': 16816,
 'mgwork': 24249,
 'ang': 2585,
 'mu': 25526,
 'skin': 34067,
 'really': 30702,
 'upset': 39718,

In [52]:
mysent = ['Eating fats is good']

mypred = clf.predict(tfidf.transform(mysent))
mypred

array([1], dtype=int64)

In [53]:
import re

In [65]:
text = 'II llooooveeee WWWWWaaaaaaatching TV'

In [66]:
x = re.sub("(.)\\1{2,}","\\1",text)
x

'II llove Watching TV'