In [99]:
# Import all required libraries

import nltk
import numpy as np
import re
import pandas as pd 
import pylab as pl
import matplotlib.pyplot as plt

from nltk.tokenize import WordPunctTokenizer
from bs4 import BeautifulSoup
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD

from sklearn import metrics
from mpl_toolkits.mplot3d import Axes3D

from matplotlib import pyplot

#plt.style.use('fivethirtyeight')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer()

!pip install datashader
import datashader as ds
import datashader.transfer_functions as tf




In [3]:
from google.colab import drive 
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [100]:
# Read the dataset

df = pd.read_csv("gdrive/My Drive/capstone-test/Saved_TweetsConsolidation.csv",encoding='UTF-8')
print(len(df))

76978


In [101]:
# Number of unique tweets
unique_text = df.full_text.unique()
print(len(unique_text))


13310


In [102]:
# Number of unique users
unique_user = df.user.unique()
len(unique_user)

32879

In [103]:
df.head(2)

Unnamed: 0,id,full_text,location,hashtags,created_at,lang,is_retweeted,user
0,1.10182e+18,b'RT @sanghaviharsh: Indians look forward to t...,,,Sat Mar 02 12:36:30 +0000 2019,en,False,9.15179e+17
1,1.10182e+18,b'RT @davidfrawleyved: Why do the leaders of t...,Jaipur,,Sat Mar 02 12:36:27 +0000 2019,en,False,158383100.0


In [104]:
# Number of users with multiple tweets

df["id"].value_counts(ascending=False).head(2)

1.102110e+18    2959
1.102090e+18    2869
Name: id, dtype: int64

In [105]:
# Unique Languages
unique_lang = df.lang.unique()
len(unique_lang)

1

In [106]:
# Number of Unique Locations
unique_location = df.location.unique()
len(unique_location)
df.location.value_counts().head(10)



India               5956
New Delhi, India    1853
Mumbai, India       1396
Mumbai               945
Bengaluru, India     894
New Delhi            803
Pakistan             656
Bangalore            607
भारत                 491
Hyderabad, India     470
Name: location, dtype: int64

In [107]:
# Unique Hashtags
unique_hashtags = df.hashtags.unique()
len(unique_hashtags)


688

In [108]:
df.full_text.count()

76978

In [0]:
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
        
    return input_txt 

In [0]:
df['Clean_text'] = np.vectorize(remove_pattern)(df['full_text'], "@[\w]*")

In [0]:
# remove special characters, numbers, punctuations
df['Clean_text'] = df['Clean_text'].str.replace("[^a-zA-Z#]", " ")

In [0]:
df['Clean_text'] = df['Clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

In [0]:
tokenized_tweet = df['Clean_text'].apply(lambda x: x.split())

In [114]:
from nltk.stem.porter import *
stemmer = PorterStemmer()

tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming
tokenized_tweet.head()

0    [indian, look, forward, futur, with, renew, se...
1    [leader, opposit, hurl, wild, alleg, promot, c...
2    [report, modi, will, address, public, meet, fo...
3    [report, modi, will, address, public, meet, fo...
4    [believ, india, safe, hand, honor, prime, mini...
Name: Clean_text, dtype: object

In [0]:
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = ' '.join(tokenized_tweet[i])

df['Clean_text'] = tokenized_tweet

In [63]:
# tok = WordPunctTokenizer()
# pattern1 = r'@[A-Za-z0-9_]+'
# pattern2 = r'https?://[^ ]+'
# combined_pat = r'|'.join((pattern1, pattern2))
# print(combined_pat)

# www_pat = r'www.[^ ]+'
# negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
#                 "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
#                 "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
#                 "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
#                 "mustn't":"must not"}
# neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')
# print(neg_pattern)


@[A-Za-z0-9_]+|https?://[^ ]+
re.compile("\\b(isn't|aren't|wasn't|weren't|haven't|hasn't|hadn't|won't|wouldn't|don't|doesn't|didn't|can't|couldn't|shouldn't|mightn't|mustn't)\\b")


In [0]:
# def tweet_cleaner_updated(text):
#     soup = BeautifulSoup(text, 'lxml')   
#     souped = soup.get_text()
   
#     try:
#         bom_removed = souped.decode("utf-8-sig").replace(u"\ufffd", "?")
#     except:
#         bom_removed = souped
#     stripped = re.sub(combined_pat, '', bom_removed)
#     stripped = re.sub(www_pat, '', stripped)
#     #lower_case = stripped.lower()
#     #neg_handled = neg_pattern.sub(lambda x: negations_dic[x.group()], lower_case)
#     neg_handled = neg_pattern.sub(lambda x: negations_dic[x.group()], stripped)
#     letters_only = re.sub("[^a-zA-Z]", " ", neg_handled)
#     # During the letters_only process two lines above, it has created unnecessay white spaces,
#     # I will tokenize and join together to remove unneccessary white spaces
#     words = [x for x  in tok.tokenize(letters_only) if len(x) > 1]
#     return (" ".join(words)).strip()

In [0]:
# df['Clean_text'] = pd.DataFrame(clean_tweets,columns=['text'])

In [116]:
df.loc[:,('full_text','Clean_text')]

Unnamed: 0,full_text,Clean_text
0,b'RT @sanghaviharsh: Indians look forward to t...,indian look forward futur with renew sens opti...
1,b'RT @davidfrawleyved: Why do the leaders of t...,leader opposit hurl wild alleg promot conspira...
2,b'RT @TimesNow: Report: PM Modi will address a...,report modi will address public meet foundat s...
3,b'RT @TimesNow: Report: PM Modi will address a...,report modi will address public meet foundat s...
4,b'@hvgoenka I believe India is in safe hands o...,believ india safe hand honor prime minist nare...
5,b'PM Narendra Modi says over 1.1 crore farmers...,narendra modi say over crore farmer have kisan...
6,b'RT @DunyaNews: Foreign media exposes Narendr...,foreign media expos narendra modi elect win pr...
7,b'RT @davidfrawleyved: As India under Narendra...,india under narendra modi develop greater sens...
8,"b""RT @IamMjahangeer: Pakistan's Prime Minister...",pakistan prime minist imran khan written lette...
9,b'Narendra Modi and the struggle for India\xe2...,narendra modi struggl india soul http xndi


In [117]:
df[df.full_text.isnull()]


Unnamed: 0,id,full_text,location,hashtags,created_at,lang,is_retweeted,user,Clean_text


In [118]:
df[df.Clean_text.isnull()]

Unnamed: 0,id,full_text,location,hashtags,created_at,lang,is_retweeted,user,Clean_text


In [119]:
# Number of unique tweets
unique_clean_text = df.Clean_text.unique()
unique_full_text = df.full_text.unique()
print(len(unique_clean_text))
print(len(unique_full_text))
print(len(df))

12388
13310
76978


In [0]:
df.drop_duplicates(subset=['Clean_text'], keep = 'first',inplace= True)

In [0]:
df.reset_index(drop=True,inplace=True)

In [122]:
df['Clean_text_length'] = df['Clean_text'].apply(len)
df.head()

Unnamed: 0,id,full_text,location,hashtags,created_at,lang,is_retweeted,user,Clean_text,Clean_text_length
0,1.10182e+18,b'RT @sanghaviharsh: Indians look forward to t...,,,Sat Mar 02 12:36:30 +0000 2019,en,False,9.15179e+17,indian look forward futur with renew sens opti...,73
1,1.10182e+18,b'RT @davidfrawleyved: Why do the leaders of t...,Jaipur,,Sat Mar 02 12:36:27 +0000 2019,en,False,158383100.0,leader opposit hurl wild alleg promot conspira...,77
2,1.10182e+18,b'RT @TimesNow: Report: PM Modi will address a...,PATIALA,,Sat Mar 02 12:36:24 +0000 2019,en,False,125042100.0,report modi will address public meet foundat s...,86
3,1.10182e+18,b'@hvgoenka I believe India is in safe hands o...,,,Sat Mar 02 12:36:20 +0000 2019,en,False,8.69916e+17,believ india safe hand honor prime minist nare...,153
4,1.10182e+18,b'PM Narendra Modi says over 1.1 crore farmers...,,,Sat Mar 02 12:36:19 +0000 2019,en,False,1.05137e+18,narendra modi say over crore farmer have kisan...,88


In [123]:
df[df['Clean_text_length']==0]['Clean_text'] ## Looks like these are tweets with different languages or just hastags.
# We can simply drop these tweets
list = df[df['Clean_text_length']==0]['Clean_text'].index
list

Int64Index([235], dtype='int64')

In [0]:
df.drop(index = list,inplace=True)

In [125]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12387 entries, 0 to 12387
Data columns (total 10 columns):
id                   12387 non-null float64
full_text            12387 non-null object
location             8792 non-null object
hashtags             2203 non-null object
created_at           12387 non-null object
lang                 12387 non-null object
is_retweeted         12387 non-null bool
user                 12387 non-null float64
Clean_text           12387 non-null object
Clean_text_length    12387 non-null int64
dtypes: bool(1), float64(2), int64(1), object(6)
memory usage: 979.8+ KB


In [126]:
df.reset_index(drop=True,inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12387 entries, 0 to 12386
Data columns (total 10 columns):
id                   12387 non-null float64
full_text            12387 non-null object
location             8792 non-null object
hashtags             2203 non-null object
created_at           12387 non-null object
lang                 12387 non-null object
is_retweeted         12387 non-null bool
user                 12387 non-null float64
Clean_text           12387 non-null object
Clean_text_length    12387 non-null int64
dtypes: bool(1), float64(2), int64(1), object(6)
memory usage: 883.1+ KB


In [0]:
from textblob import TextBlob

In [128]:
TextBlob('Still enjoying the almost nice day out.  Now its down to +13, and calling for rain...').sentiment

Sentiment(polarity=0.3148148148148148, subjectivity=0.6296296296296297)

In [0]:
c=TextBlob("This was because he wasted time listing the failures of Congress prime  ministers. This was unnecessary because they are history and you, Narendra Modi, are the first Prime Minister in 30 years to be given a full mandate by the people of India")

In [0]:
def calculate_sentiment(Clean_text):
    return TextBlob(Clean_text).sentiment

In [0]:
def calculate_sentiment_analyser(Clean_text):    
    return analyser.polarity_scores(Clean_text)

In [132]:
df['sentiment']=df.Clean_text.apply(calculate_sentiment)
df['sentiment_analyser']=df.Clean_text.apply(calculate_sentiment_analyser)


s = pd.DataFrame(index = range(0,len(df)),columns= ['compound_score','compound_score_sentiment'])

for i in range(0,len(df)): 
  s['compound_score'][i] = df['sentiment_analyser'][i]['compound']
  
  if (df['sentiment_analyser'][i]['compound'] <= -0.05):
    s['compound_score_sentiment'][i] = 'Negative'    
  if (df['sentiment_analyser'][i]['compound'] >= 0.05):
    s['compound_score_sentiment'][i] = 'Positive'
  if ((df['sentiment_analyser'][i]['compound'] >= -0.05) & (df['sentiment_analyser'][i]['compound'] <= 0.05)):
    s['compound_score_sentiment'][i] = 'Neutral'
    
df['compound_score'] = s['compound_score']
df['compound_score_sentiment'] = s['compound_score_sentiment']
df.head(4)


Unnamed: 0,id,full_text,location,hashtags,created_at,lang,is_retweeted,user,Clean_text,Clean_text_length,sentiment,sentiment_analyser,compound_score,compound_score_sentiment
0,1.10182e+18,b'RT @sanghaviharsh: Indians look forward to t...,,,Sat Mar 02 12:36:30 +0000 2019,en,False,9.15179e+17,indian look forward futur with renew sens opti...,73,"(-0.04999999999999996, 0.95)","{'neg': 0.0, 'neu': 0.827, 'pos': 0.173, 'comp...",0.3182,Positive
1,1.10182e+18,b'RT @davidfrawleyved: Why do the leaders of t...,Jaipur,,Sat Mar 02 12:36:27 +0000 2019,en,False,158383100.0,leader opposit hurl wild alleg promot conspira...,77,"(0.1, 0.4)","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,Neutral
2,1.10182e+18,b'RT @TimesNow: Report: PM Modi will address a...,PATIALA,,Sat Mar 02 12:36:24 +0000 2019,en,False,125042100.0,report modi will address public meet foundat s...,86,"(0.0, 0.06666666666666667)","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,Neutral
3,1.10182e+18,b'@hvgoenka I believe India is in safe hands o...,,,Sat Mar 02 12:36:20 +0000 2019,en,False,8.69916e+17,believ india safe hand honor prime minist nare...,153,"(0.3444444444444445, 0.35555555555555557)","{'neg': 0.064, 'neu': 0.581, 'pos': 0.355, 'co...",0.875,Positive


In [0]:
df.to_csv('gdrive/My Drive/capstone-test/unique_clean_tweets_sentiment_21.csv')

In [134]:
df.compound_score_sentiment.value_counts()

Neutral     5557
Negative    3706
Positive    3124
Name: compound_score_sentiment, dtype: int64

In [135]:
df['Clean_text'].head()

0    indian look forward futur with renew sens opti...
1    leader opposit hurl wild alleg promot conspira...
2    report modi will address public meet foundat s...
3    believ india safe hand honor prime minist nare...
4    narendra modi say over crore farmer have kisan...
Name: Clean_text, dtype: object

In [0]:
#tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# Considering 3 grams and mimnimum frq as 0
tf_idf_vect = CountVectorizer(analyzer='word',ngram_range=(1,1),stop_words='english', min_df = 0.0001)
tf_idf_vect.fit(df['Clean_text'])
desc_matrix = tf_idf_vect.transform(df["Clean_text"])


In [0]:
# implement kmeans
num_clusters = 3
km = KMeans(n_clusters=num_clusters)
km.fit(desc_matrix)
clusters = km.labels_.tolist()

In [138]:
# create DataFrame films from all of the input files.
tweets = {'Tweet': df["Clean_text"].tolist(), 'Cluster': clusters}
frame = pd.DataFrame(tweets, index = [clusters])
frame

Unnamed: 0,Cluster,Tweet
1,1,indian look forward futur with renew sens opti...
1,1,leader opposit hurl wild alleg promot conspira...
1,1,report modi will address public meet foundat s...
0,0,believ india safe hand honor prime minist nare...
1,1,narendra modi say over crore farmer have kisan...
1,1,foreign media expos narendra modi elect win pr...
1,1,india under narendra modi develop greater sens...
0,0,pakistan prime minist imran khan written lette...
1,1,narendra modi struggl india soul http xndi
1,1,should worri better sens prevail thi countri p...


In [144]:
frame['Cluster'].value_counts()

1    9963
0    1493
2     931
Name: Cluster, dtype: int64

In [142]:
frame[frame['Cluster'] == 1]

Unnamed: 0,Cluster,Tweet
1,1,indian look forward futur with renew sens opti...
1,1,leader opposit hurl wild alleg promot conspira...
1,1,report modi will address public meet foundat s...
1,1,narendra modi say over crore farmer have kisan...
1,1,foreign media expos narendra modi elect win pr...
1,1,india under narendra modi develop greater sens...
1,1,narendra modi struggl india soul http xndi
1,1,should worri better sens prevail thi countri p...
1,1,abhinandan use mean welcom will chang modi aft...
1,1,hate modi make india hater nthen jasobaden ant...


In [141]:
frame[frame['Cluster'] == 2]

Unnamed: 0,Cluster,Tweet
2,2,sign petit declar narendra modi global terrori...
2,2,sign petit declar narendra modi global terrori...
2,2,sign petit declar narendra modi global terrori...
2,2,sign petit declar narendra modi global terrori...
2,2,sign petit declar narendra modi global terrori...
2,2,sign petit declar narendra modi intern terrori...
2,2,nsign petit declar narendra modi intern terror...
2,2,sign petit declar narendra modi global terrori...
2,2,sign petit declar narendra modi global terrori...
2,2,sign petit declar narendra modi global terrori...
