# Twitter Sentiment Analysis 

In [1]:
# data link: https://raw.githubusercontent.com/laxmimerit/twitter-data/master/twitt30k.csv

In [2]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
df = pd.read_csv('https://raw.githubusercontent.com/laxmimerit/twitter-data/master/twitt30k.csv')
df

Unnamed: 0,twitts,sentiment
0,@robbiebronniman Sounds like a great night.,1
1,Damn the person who stolde my wallet !!!!! Ma...,1
2,Greetings from the piano bench (photo) http:/...,1
3,@drewryanscott i love it!! i love you!! haha f...,1
4,"@kissthestars Pretty pretty pretty please, pak...",0
...,...,...
29995,@Calumfan1 is it in any way related to photosh...,0
29996,@Swiz_NZ really? wow thats crap,0
29997,"At the 2010 lexus HS250h press event. Again, ...",0
29998,@karmicunderpath ooooh now there's a nice thou...,1


In [5]:
df['sentiment'].value_counts()

1    15000
0    15000
Name: sentiment, dtype: int64

## SVM Model and Data Preparation 

In [6]:
def run_svm(df):
    X = df['twitts']
    y = df['sentiment']

    tfidf = TfidfVectorizer()
    X = tfidf.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0, stratify = y)

    print('shape of X: ', X.shape)

    clf = LinearSVC()
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    print()
    print('Printing Report')
    print(classification_report(y_test, y_pred))
    
    return tfidf, clf

In [7]:
%%time
tfidf, clf = run_svm(df)

shape of X:  (30000, 40854)

Printing Report
              precision    recall  f1-score   support

           0       0.75      0.74      0.75      3000
           1       0.74      0.75      0.75      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000

Wall time: 1.53 s


In [8]:
x = ['i am really happy. thanks a lot for coming with me']

In [9]:
clf.predict(tfidf.transform(x))

array([1], dtype=int64)

## Data Cleaning and Retraining SVM 

In [10]:
# Use our preprocess python package

In [11]:
import preprocess_kgptalkie as pp

In [12]:
pp.__version__

'0.0.3'

In [13]:
df['twitts'] = df['twitts'].apply(lambda x: x.lower())

In [14]:
df['twitts'] = df['twitts'].apply(lambda x: pp.cont_exp(x))

In [15]:
df

Unnamed: 0,twitts,sentiment
0,@robbiebronniman sounds like a great night.,1
1,damn the person who stolde my wallet !!!!! ma...,1
2,greetings from the piano bench (photo) http:/...,1
3,@drewryanscott i love it!! i love you!! haha f...,1
4,"@kissthestars pretty pretty pretty please, pak...",0
...,...,...
29995,@calumfan1 is it in any way related to photosh...,0
29996,@swiz_nz really? wow thats crap,0
29997,"at the 2010 lexus hs250h press event. again, ...",0
29998,@karmicunderpath ooooh now there is a nice tho...,1


In [16]:
run_svm(df)

shape of X:  (30000, 40846)

Printing Report
              precision    recall  f1-score   support

           0       0.75      0.74      0.75      3000
           1       0.75      0.76      0.75      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000



(TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.float64'>, encoding='utf-8',
                 input='content', lowercase=True, max_df=1.0, max_features=None,
                 min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                 smooth_idf=True, stop_words=None, strip_accents=None,
                 sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                 tokenizer=None, use_idf=True, vocabulary=None),
 LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
           intercept_scaling=1, loss='squared_hinge', max_iter=1000,
           multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
           verbose=0))

In [17]:
# remove emails and urls

df['twitts'] = df['twitts'].apply(lambda x: pp.remove_emails(x))
df['twitts'] = df['twitts'].apply(lambda x: pp.remove_urls(x))
df['twitts'] = df['twitts'].apply(lambda x: pp.remove_rt(x))
df['twitts'] = df['twitts'].apply(lambda x: pp.remove_html_tags(x))
df['twitts'] = df['twitts'].apply(lambda x: pp.remove_special_chars(x))


In [18]:
tfidf, clf = run_svm(df)

shape of X:  (30000, 42931)

Printing Report
              precision    recall  f1-score   support

           0       0.75      0.74      0.74      3000
           1       0.74      0.75      0.75      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000



In [19]:
x

['i am really happy. thanks a lot for coming with me']

In [20]:
clf.predict(tfidf.transform(x))

array([1], dtype=int64)

## Fine Tuning Model

In [21]:
df

Unnamed: 0,twitts,sentiment
0,robbiebronniman sounds like a great night,1
1,damn the person who stolde my wallet may karma...,1
2,greetings from the piano bench photo,1
3,drewryanscott i love it i love you haha forget...,1
4,kissthestars pretty pretty pretty please pakid...,0
...,...,...
29995,calumfan1 is it in any way related to photoshop,0
29996,swiz_nz really wow thats crap,0
29997,at the 2010 lexus hs250h press event again can...,0
29998,karmicunderpath ooooh now there is a nice thought,1


In [22]:
def run_svm(df):
    X = df['twitts']
    y = df['sentiment']

    tfidf = TfidfVectorizer(norm = 'l1', ngram_range=(1,2), analyzer='word', max_features=5000)
    X = tfidf.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0, stratify = y)

    print('shape of X: ', X.shape)

    clf = LinearSVC()
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    print()
    print('Printing Report')
    print(classification_report(y_test, y_pred))
    
    return tfidf, clf

run_svm(df)

shape of X:  (30000, 5000)

Printing Report
              precision    recall  f1-score   support

           0       0.75      0.77      0.76      3000
           1       0.77      0.75      0.76      3000

    accuracy                           0.76      6000
   macro avg       0.76      0.76      0.76      6000
weighted avg       0.76      0.76      0.76      6000



(TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.float64'>, encoding='utf-8',
                 input='content', lowercase=True, max_df=1.0, max_features=5000,
                 min_df=1, ngram_range=(1, 2), norm='l1', preprocessor=None,
                 smooth_idf=True, stop_words=None, strip_accents=None,
                 sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                 tokenizer=None, use_idf=True, vocabulary=None),
 LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
           intercept_scaling=1, loss='squared_hinge', max_iter=1000,
           multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
           verbose=0))

## Saving and Loading ML Model 

In [23]:
import pickle

In [24]:
pickle.dump(clf, open('clf.pkl', 'wb'))
pickle.dump(tfidf, open('tfidf.pkl', 'wb'))

In [25]:
del clf
del tfidf

In [26]:
clf = pickle.load(open('clf.pkl', 'rb'))
tfidf = pickle.load(open('tfidf.pkl', 'rb'))

In [27]:
clf

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [28]:
# tfidf.vocabulary_

In [29]:
x

['i am really happy. thanks a lot for coming with me']

In [30]:

clf.predict(tfidf.transform(x))

array([1], dtype=int64)

# Real-Time Twitter Sentiment Analysis 

In [17]:
consumer_key = 'R7DGimRNkT11sbngA0MRqLmNE'
consumer_secret = 'w5Axtw43feejwgmPIhqPhPOt1aHso1Guw1yuFwlmijtlh0vguK'
access_token = '1279486577656295425-l3gaKqKuHQdKl44rPXUc0WYcc26wgq'
access_token_secret = '80dGAdcx6LuoWM1mSt669V5NESP0EOuX1dK8Mianjqxi2'

In [18]:
# !pip install tweepy

In [19]:
import tweepy

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

public_tweets = api.home_timeline()


In [37]:
type(public_tweets)

tweepy.models.ResultSet

In [44]:
public_tweets[0].text

'RT @LittleFrenchKev: AI air hockey battle ! no training with hardware fully sim to real .\n\n#mecatronic #mecatronics #robotic #robotics #rob…'

In [45]:
for tweet in public_tweets:
    print(tweet.text)

RT @LittleFrenchKev: AI air hockey battle ! no training with hardware fully sim to real .

#mecatronic #mecatronics #robotic #robotics #rob…
RT @zeldia123: 8 Advanced python tricks💎
See if there is anything new for you to learn ?
#100DaysOfMLCode #100DaysOfCode #DataScience #Pyth…
RT @quaicoor: Please what specific topics in Multivariable calculus and Linear Algebra are needed for machine learning? I need help.
#100Da…
How to Transform Target Variables for Regression in Python https://t.co/Op9uHbFie5
RT @pierrepinna: Rejoignez-moi ce mercredi 8 juillet en direct pour discuter d'Intelligence Artificielle dans le #Ecommerce &amp; Retail.
#1to1…
RT @facebookai: We’re sharing the first analysis of photo sharing's impact on heritage tourism. We used #computervision to extract insights…
RT @gold_ochim: Day [18/100]
Still on DSN AI+ classes...
Modeling has begun.... Linear Regression.
#100DaysOfCode #CodeNewbie #Python #Mach…
RT @charlyrere: Enterprise Big Data Professional Certification Cour

## Tracking Keywords on Twitter 

In [14]:
import json
import pickle
import tweepy

In [5]:
from textblob import TextBlob

In [6]:
import preprocess_kgptalkie as pp

In [7]:
clf = pickle.load(open('clf.pkl', 'rb'))
tfidf = pickle.load(open('tfidf.pkl', 'rb'))

In [8]:
def predict_sentiment(x):
    x = [x]
    sent = clf.predict(tfidf.transform(x))
    return sent

In [12]:
predict_sentiment('what i am not the best')[0]

0

In [22]:
track_keyword = ['usa', 'china']

In [24]:
usa = 0
china = 0

class MyStreamListener(tweepy.StreamListener):
    
    def on_status(self, status):
        print(status.text)
    
    def on_data(self, data):
        raw_twitts = json.loads(data)
        try:
            x = str(raw_twitts['text']).lower()
            x = pp.cont_exp(x)
            x = pp.remove_emails(x)
            x = pp.remove_html_tags(x)
            x = pp.remove_rt(x)
            x = pp.remove_special_chars(x)
            x = pp.remove_urls(x)
            
#             blob = TextBlob(x)
#             sentiment = predict_sentiment(x)[0]
#             print(sentiment)
            
            global china
            global usa
            
            if 'usa' in x and 'china' not in x:
                sent = predict_sentiment(x)[0]
                usa = usa + sent
                
            elif 'china' in x and 'usa' not in x:
                sent = predict_sentiment(x)[0]
                china = china + sent
                
            else:
                pass
            
            print('usa: ', usa, 'china: ', china)
            
            
        except:
            pass
        
        
        
    def on_error(self, status_code):
        if status_code == 420:
            print('Error 420')
            #returning False in on_error disconnects the stream
            return False

In [25]:
myStreamListener = MyStreamListener()
myStream = tweepy.Stream(auth = api.auth, listener=myStreamListener)

In [None]:
myStream.filter(track=track_keyword)

usa:  0 china:  0
usa:  1 china:  0
usa:  2 china:  0
usa:  2 china:  0
usa:  2 china:  0
usa:  2 china:  0
usa:  2 china:  0
usa:  2 china:  0
usa:  2 china:  0
usa:  2 china:  1
usa:  2 china:  2
usa:  2 china:  2
usa:  3 china:  2
usa:  3 china:  2
usa:  3 china:  2
usa:  4 china:  2
usa:  5 china:  2
usa:  5 china:  2
usa:  5 china:  2
usa:  5 china:  2
usa:  5 china:  3
usa:  5 china:  3
usa:  6 china:  3
usa:  6 china:  3
usa:  6 china:  3
usa:  6 china:  3
usa:  6 china:  4
usa:  7 china:  4
usa:  7 china:  4
usa:  7 china:  4
usa:  7 china:  4
usa:  7 china:  4
usa:  7 china:  4
usa:  7 china:  4
usa:  7 china:  4
usa:  7 china:  4
usa:  7 china:  4
usa:  7 china:  4
usa:  7 china:  4
usa:  7 china:  5
usa:  8 china:  5
usa:  8 china:  5
usa:  8 china:  5
usa:  8 china:  6
usa:  8 china:  6
usa:  8 china:  7
usa:  8 china:  7
usa:  8 china:  7
usa:  8 china:  8
usa:  8 china:  8
usa:  8 china:  8
usa:  8 china:  8
usa:  8 china:  8
usa:  9 china:  8
usa:  9 china:  8
usa:  10 c

usa:  66 china:  41
usa:  66 china:  41
usa:  66 china:  41
usa:  66 china:  41
usa:  67 china:  41
usa:  67 china:  41
usa:  67 china:  41
usa:  67 china:  41
usa:  67 china:  42
usa:  67 china:  42
usa:  67 china:  42
usa:  67 china:  42
usa:  67 china:  42
usa:  67 china:  42
usa:  67 china:  42
usa:  67 china:  42
usa:  67 china:  42
usa:  67 china:  42
usa:  68 china:  42
usa:  68 china:  42
usa:  68 china:  42
usa:  68 china:  42
usa:  68 china:  42
usa:  68 china:  42
usa:  69 china:  42
usa:  69 china:  42
usa:  69 china:  42
usa:  69 china:  42
usa:  69 china:  42
usa:  70 china:  42
usa:  70 china:  42
usa:  70 china:  43
usa:  70 china:  43
usa:  70 china:  43
usa:  70 china:  43
usa:  70 china:  43
usa:  71 china:  43
usa:  71 china:  43
usa:  71 china:  43
usa:  71 china:  43
usa:  71 china:  43
usa:  71 china:  43
usa:  71 china:  43
usa:  71 china:  43
usa:  72 china:  43
usa:  72 china:  43
usa:  72 china:  43
usa:  72 china:  44
usa:  72 china:  44
usa:  72 china:  44


usa:  132 china:  97
usa:  133 china:  97
usa:  134 china:  97
usa:  134 china:  97
usa:  134 china:  97
usa:  134 china:  98
usa:  134 china:  98
usa:  135 china:  98
usa:  135 china:  98
usa:  135 china:  98
usa:  135 china:  98
usa:  135 china:  98
usa:  135 china:  98
usa:  135 china:  98
usa:  136 china:  98
usa:  137 china:  98
usa:  137 china:  99
usa:  137 china:  99
usa:  137 china:  99
usa:  137 china:  99
usa:  137 china:  99
usa:  137 china:  99
usa:  137 china:  99
usa:  137 china:  99
usa:  137 china:  100
usa:  137 china:  100
usa:  137 china:  100
usa:  137 china:  100
usa:  137 china:  100
usa:  138 china:  100
usa:  138 china:  100
usa:  138 china:  100
usa:  138 china:  100
usa:  138 china:  101
usa:  138 china:  101
usa:  138 china:  101
usa:  138 china:  101
usa:  138 china:  101
usa:  138 china:  101
usa:  138 china:  101
usa:  138 china:  101
usa:  138 china:  101
usa:  138 china:  101
usa:  138 china:  101
usa:  138 china:  101
usa:  138 china:  101
usa:  138 ch