In [8]:
keyword = "demonetization" 
number = 10000
filename = "demonetization-tweets_Clusters.csv"
file_count = "demonetization-tweets"

#### Step 1: Import data

In [2]:
import pandas as pd 
import os 
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [3]:
# Identify the encoding of the data file
import chardet
with open('./demonetization-tweets.csv','rb') as f:
    result = chardet.detect(f.read())  #Windows-1252

In [4]:
result

{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}

#### Step 2: Clean the tweets

In [22]:
# Import the data file
df = pd.read_csv("./"+file_count+".csv", encoding=result['encoding'])
df = df[1:number+1]
df = df['text']
df = pd.DataFrame({'tweet':df})

In [23]:
df.head()

Unnamed: 0,tweet
1,RT @Hemant_80: Did you vote on #Demonetization...
2,"RT @roshankar: Former FinSec, RBI Dy Governor,..."
3,RT @ANI_news: Gurugram (Haryana): Post office ...
4,RT @satishacharya: Reddy Wedding! @mail_today ...
5,@DerekScissors1: India’s #demonetization: #Bla...


In [24]:
# Clean the tweets
df['cleaned_tweet'] = df['tweet'].replace(r'\'|\"|\,|\.|\?|\+|\-|\/|\=|\(|\)|\n|"', '', regex=True)
df['cleaned_tweet'] = df['cleaned_tweet'].replace("  ", " ")

words_remove = ["ax","i","you","edu","s","t","m","subject","can","lines","re","what", "there","all","we",
                "one","the","a","an","of","or","in","for","by","on","but","is","in","a","not","with","as",
                "was","if","they","are","this","and","it","have","has","from","at","my","be","by","not","that","to",
                "from","com","org","like","likes","so","said","from","what","told","over","more","other",
                "have","last","with","this","that","such","when","been","says","will","also","where","why",
                "would","today", "in", "on", "you", "r", "d", "u", "hw","wat", "oly", "s", "b", "ht", 
                "rt", "p","the","th", "n", "was"]

In [26]:
def cleantext(df, words_to_remove = words_remove): 
    ### dont change the original tweet
    # remove emoticons form the tweets
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'<ed>','', regex = True)
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'\B<U+.*>|<U+.*>\B|<U+.*>','', regex = True)
    
    # convert tweets to lowercase
    df['cleaned_tweet'] = df['cleaned_tweet'].str.lower()
    
    #remove user mentions
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'^(@\w+)',"", regex=True)
    
    #remove 'rt' in the beginning
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'^(rt @)',"", regex=True)
    
    #remove_symbols
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'[^a-zA-Z0-9]', " ", regex=True)

    #remove punctuations 
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'[[]!"#$%\'()\*+,-./:;<=>?^_`{|}]+',"", regex = True)

    #remove_URL(x):
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'https.*$', "", regex = True)

    #remove 'amp' in the text
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'amp',"", regex = True)
    
    #remove words of length 1 or 2 
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'\b[a-zA-Z]{1,2}\b','', regex=True)

    #remove extra spaces in the tweet
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'^\s+|\s+$'," ", regex=True)
     
    
    #remove stopwords and words_to_remove
    stop_words = set(stopwords.words('english'))
    mystopwords = [stop_words, "via", words_to_remove]
    
    df['fully_cleaned_tweet'] = df['cleaned_tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in mystopwords]))
    

    return df

In [27]:
#get the processed tweets
df = cleantext(df)
df.head()

  re.compile(obj)


Unnamed: 0,tweet,cleaned_tweet,fully_cleaned_tweet
1,RT @Hemant_80: Did you vote on #Demonetization...,hemant 80 did you vote demonetization modi...,hemant 80 did you vote demonetization modi sur...
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",roshankar former finsec rbi governor cbdt ch...,roshankar former finsec rbi governor cbdt chai...
3,RT @ANI_news: Gurugram (Haryana): Post office ...,ani news gurugram haryana post office employ...,ani news gurugram haryana post office employee...
4,RT @satishacharya: Reddy Wedding! @mail_today ...,satishacharya reddy wedding mail today cart...,satishacharya reddy wedding mail today cartoon...
5,@DerekScissors1: India’s #demonetization: #Bla...,india demonetization blackmoney symptom ...,india demonetization blackmoney symptom not th...


In [29]:
!pip install textblob

Collecting textblob
  Downloading https://files.pythonhosted.org/packages/60/f0/1d9bfcc8ee6b83472ec571406bd0dd51c0e6330ff1a51b2d29861d389e85/textblob-0.15.3-py2.py3-none-any.whl (636kB)
Installing collected packages: textblob
Successfully installed textblob-0.15.3


In [30]:
#Sentiment Analysis
from textblob import TextBlob
df['sentiment'] = df['fully_cleaned_tweet'].apply(lambda x: TextBlob(x).sentiment.polarity)  #-1 to 1

In [31]:
df.head()

Unnamed: 0,tweet,cleaned_tweet,fully_cleaned_tweet,sentiment
1,RT @Hemant_80: Did you vote on #Demonetization...,hemant 80 did you vote demonetization modi...,hemant 80 did you vote demonetization modi sur...,0.0
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",roshankar former finsec rbi governor cbdt ch...,roshankar former finsec rbi governor cbdt chai...,0.0
3,RT @ANI_news: Gurugram (Haryana): Post office ...,ani news gurugram haryana post office employ...,ani news gurugram haryana post office employee...,0.0
4,RT @satishacharya: Reddy Wedding! @mail_today ...,satishacharya reddy wedding mail today cart...,satishacharya reddy wedding mail today cartoon...,0.0
5,@DerekScissors1: India’s #demonetization: #Bla...,india demonetization blackmoney symptom ...,india demonetization blackmoney symptom not th...,0.0


#### Step 3: Vectorize the tweets

In [32]:
df['tokenized_tweet'] = df['fully_cleaned_tweet'].apply(word_tokenize)
df.head()

Unnamed: 0,tweet,cleaned_tweet,fully_cleaned_tweet,sentiment,tokenized_tweet
1,RT @Hemant_80: Did you vote on #Demonetization...,hemant 80 did you vote demonetization modi...,hemant 80 did you vote demonetization modi sur...,0.0,"[hemant, 80, did, you, vote, demonetization, m..."
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",roshankar former finsec rbi governor cbdt ch...,roshankar former finsec rbi governor cbdt chai...,0.0,"[roshankar, former, finsec, rbi, governor, cbd..."
3,RT @ANI_news: Gurugram (Haryana): Post office ...,ani news gurugram haryana post office employ...,ani news gurugram haryana post office employee...,0.0,"[ani, news, gurugram, haryana, post, office, e..."
4,RT @satishacharya: Reddy Wedding! @mail_today ...,satishacharya reddy wedding mail today cart...,satishacharya reddy wedding mail today cartoon...,0.0,"[satishacharya, reddy, wedding, mail, today, c..."
5,@DerekScissors1: India’s #demonetization: #Bla...,india demonetization blackmoney symptom ...,india demonetization blackmoney symptom not th...,0.0,"[india, demonetization, blackmoney, symptom, n..."


In [33]:
#if a word has a digit, remove that word
df['tokenized_tweet'] = df['tokenized_tweet'].apply(lambda x: [y for y in x if not any(c.isdigit() for c in y)])

In [34]:
# Set values for various parameters
num_features = 100    # Word vector dimensionality                      
min_word_count = 1   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size    

In [35]:
# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(df['tokenized_tweet'], workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)


Training model...


In [52]:
print(model)
model['hemant']

Word2Vec(vocab=7652, size=100, alpha=0.025)


  


array([-0.01849402,  0.11456431,  0.15560867, -0.10890228, -0.09181392,
        0.10000034, -0.04142923,  0.0160463 ,  0.00058721, -0.12933941,
       -0.05741211, -0.11873823,  0.0970791 ,  0.12387466,  0.18476829,
       -0.13852133, -0.04183862,  0.07700722, -0.14404123,  0.08503724,
       -0.11037641,  0.05916216,  0.00923787, -0.05201008, -0.12971872,
        0.012409  ,  0.1403049 , -0.22167183,  0.03262706,  0.0381026 ,
        0.03285928, -0.00338404,  0.02319512, -0.02338156, -0.0084332 ,
        0.00465545,  0.01385034,  0.18238716,  0.150967  , -0.10211366,
       -0.23472099, -0.04840574,  0.05870393, -0.00891721, -0.1449757 ,
        0.02450378,  0.04257541,  0.08848149,  0.01166491, -0.0021478 ,
        0.0930557 ,  0.046371  ,  0.16386081,  0.0321661 , -0.11081693,
        0.08191402,  0.0402762 , -0.07332906,  0.00054731,  0.18398537,
       -0.16737825, -0.21648459,  0.05894341,  0.017719  ,  0.12901856,
       -0.05735436, -0.01162308,  0.11199737, -0.01410391, -0.09

##### Find vector corresponding to each tweet

In [41]:
#Take the average of all word vectors in a tweet
import numpy as np
vocab = list(model.wv.vocab)
def sentence_vector(sentence, model):
    nwords = 0
    featureV = np.zeros(100, dtype="float32")
    for word in sentence:
        if word not in vocab:
            continue
        featureV = np.add(featureV, model[word])
        nwords = nwords + 1
    if nwords > 0: 
        featureV = np.divide(featureV, nwords)
    return featureV

In [42]:
tweet_vector = df['tokenized_tweet'].apply(lambda x: sentence_vector(x, model))  
tweet_vector = tweet_vector.apply(pd.Series)
tweet_vector

  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
1,0.005265,0.145178,0.147514,-0.069161,-0.067178,0.079719,-0.050528,0.011328,0.033261,-0.135008,...,-0.086182,0.108031,0.086109,-0.056946,0.025146,0.046597,0.133032,0.105447,0.057718,0.122388
2,-0.050099,-0.009989,0.195613,-0.007635,-0.083550,0.091357,-0.049674,-0.021680,-0.034829,-0.062094,...,-0.140587,0.161284,0.050666,0.013894,0.042011,0.051530,0.102566,0.150345,0.028524,0.308597
3,-0.065409,0.014514,0.209758,0.000943,-0.064745,0.083866,-0.046320,-0.019571,-0.022617,-0.064024,...,-0.125582,0.137329,0.057194,0.022022,0.035088,0.048947,0.075539,0.144824,0.030367,0.327682
4,-0.054286,-0.026542,0.170636,-0.085281,-0.082708,0.103345,-0.053977,-0.038898,-0.012208,-0.031153,...,-0.115264,0.126924,0.035904,-0.019648,0.023975,0.023249,0.082649,0.113520,0.020971,0.251372
5,-0.087156,0.042076,0.164309,0.046300,-0.036794,0.025005,-0.046114,-0.006224,0.010800,-0.072754,...,-0.084652,0.118738,0.055630,0.027680,0.028533,0.002835,0.052109,0.127632,0.015594,0.242863
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,0.057474,0.000632,0.171483,-0.074609,-0.083344,0.101339,-0.076492,-0.041278,0.003894,-0.057157,...,-0.110283,0.149861,0.055882,-0.018538,-0.026981,0.070641,0.145886,0.102768,0.080955,0.206910
9997,-0.012201,0.058777,0.079949,-0.101394,-0.096331,0.066315,-0.058465,0.018600,0.008244,-0.152532,...,-0.130359,0.095423,0.026217,-0.077861,0.057959,0.055100,0.100299,0.104382,0.036243,0.111875
9998,0.057474,0.000632,0.171483,-0.074609,-0.083344,0.101339,-0.076492,-0.041278,0.003894,-0.057157,...,-0.110283,0.149861,0.055882,-0.018538,-0.026981,0.070641,0.145886,0.102768,0.080955,0.206910
9999,0.057474,0.000632,0.171483,-0.074609,-0.083344,0.101339,-0.076492,-0.041278,0.003894,-0.057157,...,-0.110283,0.149861,0.055882,-0.018538,-0.026981,0.070641,0.145886,0.102768,0.080955,0.206910


In [63]:
#Tweet vector should vary from 0 to 1 (normalize the vector)
for x in range(len(tweet_vector)):
    x_min = tweet_vector.iloc[x].min()
    x_max = tweet_vector.iloc[x].max()
    X  = tweet_vector.iloc[x]
    i = 0
    if (x_max - x_min) == 0:
        for y in X:
            tweet_vector.iloc[x][i] = (1/len(tweet_vector.iloc[x]))
            i = i + 1
    else:
        for y in X:
            tweet_vector.iloc[x][i] = ((y - x_min)/(x_max - x_min))
            i = i + 1


In [64]:
print(tweet_vector)

             0         1         2         3         4         5         6   \
1      0.525097  0.843612  0.848932  0.355665  0.360180  0.694594  0.398084   
2      0.324409  0.399955  0.787198  0.404388  0.261405  0.590836  0.325210   
3      0.293233  0.436933  0.787976  0.412532  0.294428  0.561625  0.327554   
4      0.300143  0.359965  0.785131  0.233310  0.238858  0.640036  0.300809   
5      0.248238  0.542621  0.821058  0.552243  0.362961  0.503734  0.341729   
...         ...       ...       ...       ...       ...       ...       ...   
9996   0.583541  0.443594  0.864232  0.258350  0.236844  0.691535  0.253716   
9997   0.470926  0.637027  0.686573  0.262201  0.274049  0.654666  0.362662   
9998   0.583541  0.443594  0.864232  0.258350  0.236844  0.691535  0.253716   
9999   0.583541  0.443594  0.864232  0.258350  0.236844  0.691535  0.253716   
10000  0.385479  0.477486  0.858521  0.640531  0.462368  0.696598  0.320735   

             7         8         9   ...        90 

#### Step 4: Add sentiment to the tweet vector

In [65]:
#Scale the 'sentiment' vector
#Sentiment varies from -1 to +1

def sentiment(x):
    if x < 0.04:
        return 0
    elif x > 0.04:
        return 1
    else:
        return 0.5

tweet_vector[100] = df['sentiment'].apply(lambda x: sentiment(x))

In [66]:
tweet_vector  #sentiment 0 to +1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
1,0.525097,0.843612,0.848932,0.355665,0.360180,0.694594,0.398084,0.538902,0.588831,0.205763,...,0.759048,0.709141,0.383472,0.570357,0.619191,0.815963,0.753164,0.644509,0.791732,0
2,0.324409,0.399955,0.787198,0.404388,0.261405,0.590836,0.325210,0.377935,0.353170,0.301818,...,0.722542,0.514196,0.444938,0.497895,0.515823,0.611949,0.701938,0.472493,1.000000,0
3,0.293233,0.436933,0.787976,0.412532,0.294428,0.561625,0.327554,0.375649,0.370172,0.295723,...,0.657749,0.513670,0.450432,0.473924,0.498841,0.546655,0.671225,0.465436,1.000000,0
4,0.300143,0.359965,0.785131,0.233310,0.238858,0.640036,0.300809,0.333322,0.390874,0.350023,...,0.690877,0.494616,0.374831,0.468893,0.467329,0.595409,0.661975,0.462416,0.959218,0
5,0.248238,0.542621,0.821058,0.552243,0.362961,0.503734,0.341729,0.432595,0.471377,0.281046,...,0.717251,0.573496,0.509828,0.511771,0.453232,0.565474,0.737511,0.482297,1.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,0.583541,0.443594,0.864232,0.258350,0.236844,0.691535,0.253716,0.340412,0.451626,0.301318,...,0.810997,0.579620,0.396399,0.375612,0.615956,0.801211,0.695054,0.641351,0.951453,1
9997,0.470926,0.637027,0.686573,0.262201,0.274049,0.654666,0.362662,0.543005,0.518771,0.142528,...,0.722784,0.560831,0.317271,0.635111,0.628423,0.734195,0.743750,0.584293,0.761286,1
9998,0.583541,0.443594,0.864232,0.258350,0.236844,0.691535,0.253716,0.340412,0.451626,0.301318,...,0.810997,0.579620,0.396399,0.375612,0.615956,0.801211,0.695054,0.641351,0.951453,1
9999,0.583541,0.443594,0.864232,0.258350,0.236844,0.691535,0.253716,0.340412,0.451626,0.301318,...,0.810997,0.579620,0.396399,0.375612,0.615956,0.801211,0.695054,0.641351,0.951453,1


In [67]:
#Updating the 'sentiment' column in df also
df['sentiment'] = tweet_vector[100]

### Step 5: Cluster the narratives [= opinions + expressions]

In [68]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples

In [69]:
range_n_clusters = [4, 5, 6, 7, 8, 9, 10, 11]
X = tweet_vector
n_best_clusters = 0
silhouette_best = 0

In [70]:
for n_clusters in range_n_clusters:
    
    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(X, cluster_labels)
                                      #, sample_size = 5000)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
    
    if silhouette_avg > silhouette_best:
        silhouette_best = silhouette_avg
        n_best_clusters = n_clusters

For n_clusters = 4 The average silhouette_score is : 0.2359700194167453
For n_clusters = 5 The average silhouette_score is : 0.2507793763173987
For n_clusters = 6 The average silhouette_score is : 0.25837742981133377
For n_clusters = 7 The average silhouette_score is : 0.2694055595261788
For n_clusters = 8 The average silhouette_score is : 0.29659779189975005
For n_clusters = 9 The average silhouette_score is : 0.30308841567776906
For n_clusters = 10 The average silhouette_score is : 0.3037191199973398
For n_clusters = 11 The average silhouette_score is : 0.30266046984779454


In [71]:
clusterer = KMeans(n_clusters= n_best_clusters , random_state=10)
cluster_labels = clusterer.fit_predict(X)

In [91]:
np.unique(cluster_labels)  

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [72]:
#Array of tweets, the corresponding cluster number, sentiment
finaldf = pd.DataFrame({'cl_num': cluster_labels,'fully_cleaned_tweet': df['fully_cleaned_tweet'], 'cleaned_tweet': df['cleaned_tweet'], 'tweet': df['tweet'],'sentiment': df['sentiment']})
finaldf = finaldf.sort_values(by=['cl_num'])

In [73]:
df['cl_num'] = cluster_labels

In [74]:
df

Unnamed: 0,tweet,cleaned_tweet,fully_cleaned_tweet,sentiment,tokenized_tweet,cl_num
1,RT @Hemant_80: Did you vote on #Demonetization...,hemant 80 did you vote demonetization modi...,hemant 80 did you vote demonetization modi sur...,0,"[hemant, did, you, vote, demonetization, modi,...",3
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",roshankar former finsec rbi governor cbdt ch...,roshankar former finsec rbi governor cbdt chai...,0,"[roshankar, former, finsec, rbi, governor, cbd...",1
3,RT @ANI_news: Gurugram (Haryana): Post office ...,ani news gurugram haryana post office employ...,ani news gurugram haryana post office employee...,0,"[ani, news, gurugram, haryana, post, office, e...",1
4,RT @satishacharya: Reddy Wedding! @mail_today ...,satishacharya reddy wedding mail today cart...,satishacharya reddy wedding mail today cartoon...,0,"[satishacharya, reddy, wedding, mail, today, c...",7
5,@DerekScissors1: India’s #demonetization: #Bla...,india demonetization blackmoney symptom ...,india demonetization blackmoney symptom not th...,0,"[india, demonetization, blackmoney, symptom, n...",6
...,...,...,...,...,...,...
9996,"RT @AdityaNair20: First time in 29years, our M...",adityanair20 first time 29years our milkman ...,adityanair20 first time 29years our milkman as...,1,"[first, time, our, milkman, asked, for, check,...",0
9997,RT @URautelaForever: Dear @evanspiegel \r\nInd...,urautelaforever dear evanspiegel india ri...,urautelaforever dear evanspiegel india rich th...,1,"[urautelaforever, dear, evanspiegel, india, ri...",0
9998,"RT @AdityaNair20: First time in 29years, our M...",adityanair20 first time 29years our milkman ...,adityanair20 first time 29years our milkman as...,1,"[first, time, our, milkman, asked, for, check,...",0
9999,"RT @AdityaNair20: First time in 29years, our M...",adityanair20 first time 29years our milkman ...,adityanair20 first time 29years our milkman as...,1,"[first, time, our, milkman, asked, for, check,...",0


In [75]:
dfOrdered = pd.DataFrame(df)

#Compute how many times a tweet has been 'retweeted' - that is, how many rows in dfOrdered are identical
dfOrdered['tokenized_tweet'] = dfOrdered['tokenized_tweet'].apply(tuple)
dfUnique = dfOrdered.groupby(['tweet', 'cleaned_tweet', 'fully_cleaned_tweet', 'sentiment','tokenized_tweet', 'cl_num']).size().reset_index(name="freq")
dfUnique = dfUnique.sort_values(by=['cl_num'])

In [76]:
dfUnique['tokenized_tweet'] = dfUnique['tokenized_tweet'].apply(list)
dfOrdered['tokenized_tweet'] = dfOrdered['tokenized_tweet'].apply(list)

In [77]:
dfUnique

Unnamed: 0,tweet,cleaned_tweet,fully_cleaned_tweet,sentiment,tokenized_tweet,cl_num,freq
2307,RT @SwachhPolitics: #Demonetization\r\nCan't b...,swachhpolitics demonetization cant believe p...,swachhpolitics demonetization cant believe peo...,1,"[swachhpolitics, demonetization, cant, believe...",0,3
1711,"Months after #DeMonetization, many #ATMs conti...",months after demonetization many atms contin...,months after demonetization many atms continue...,1,"[months, after, demonetization, many, atms, co...",0,1
1710,"Modi’s Demonetization was not a Failure, prove...",modi demonetization was not failure proves m...,modi demonetization was not failure proves man...,1,"[modi, demonetization, was, not, failure, prov...",0,1
1705,Modi's #DeMonetization was actually a strategy...,modis demonetization was actually strategy ...,modis demonetization was actually strategy bri...,1,"[modis, demonetization, was, actually, strateg...",0,1
2751,RT @rashmitambe: What exactly is problem in ad...,rashmitambe what exactly problem addressing...,rashmitambe what exactly problem addressing pa...,1,"[rashmitambe, what, exactly, problem, addressi...",0,1
...,...,...,...,...,...,...,...
1332,"Everyone seems to hate the rich, even the rich...",everyone seems hate the rich even the rich ha...,everyone seems hate the rich even the rich hat...,0,"[everyone, seems, hate, the, rich, even, the, ...",9,1
1175,By the time the Opposition finishes with its n...,the time the opposition finishes with its nau...,the time the opposition finishes with its naut...,0,"[the, time, the, opposition, finishes, with, i...",9,1
2798,RT @saxenask352: 95% sacrificing to cleanse 5%...,saxenask352 95 sacrificing cleanse 5 benef...,saxenask352 95 sacrificing cleanse 5 benefits ...,0,"[sacrificing, cleanse, benefits, demonetizatio...",9,1
2981,Shiva Sena and SAD fall in line with the gover...,shiva sena and sad fall line with the governm...,shiva sena and sad fall line with the governme...,0,"[shiva, sena, and, sad, fall, line, with, the,...",9,1


#### Discard the clusters with poor Silhouette score

In [78]:
# Compute the silhouette scores for each sample
sample_silhouette_values = silhouette_samples(X, cluster_labels)

In [82]:
sample_silhouette_values

array([0.38434295, 0.49248217, 0.52784359, ..., 0.20879619, 0.20879619,
       0.4797031 ])

In [89]:
poor_cluster_indices = []
avg_cluster_sil_score = []

for i in range(n_best_clusters):
# Aggregate the silhouette scores for samples belonging to
# cluster i, and sort them
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        avgscore = (np.mean(ith_cluster_silhouette_values))   #average silhouette score for each cluster
        avg_cluster_sil_score = np.append(avg_cluster_sil_score, avgscore)
        print('Cluster',i, ':', avgscore)
        if avgscore < 0.2:
            poor_cluster_indices = np.append(poor_cluster_indices, i)
            
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]

Cluster 0 : 0.17641072213642167
Cluster 1 : 0.3394631070186884
Cluster 2 : 0.3974449093377233
Cluster 3 : 0.1680751858026234
Cluster 4 : 0.24000622092279547
Cluster 5 : 0.5386852489147615
Cluster 6 : 0.0500566773070515
Cluster 7 : 0.3746520006627592
Cluster 8 : 0.3760914784156285
Cluster 9 : 0.5527857710094498


In [92]:
poor_cluster_indices

array([0., 3., 6.])

In [93]:
#remove those rows where cluster value match poor_cluster_indices 
avg_cluster_sil_score_final = []
cluster_name = np.unique(dfOrdered['cl_num'])

if (len(poor_cluster_indices)!=0):
    n_final_clusters = n_best_clusters - len(poor_cluster_indices)
    for i in poor_cluster_indices:
        dfUnique = dfUnique[dfUnique['cl_num'] != i]
    for j in cluster_name:
        if j not in poor_cluster_indices:    
            avg_cluster_sil_score_final = np.append(avg_cluster_sil_score_final, avg_cluster_sil_score[j])
            
    cluster_name = np.unique(dfUnique['cl_num'])
    

In [94]:
dfUnique['cl_num'] = abs(dfUnique['cl_num'])
dfUnique = dfUnique.sort_values(by=['cl_num'])

#### Step 6: Calculate abstraction and expression for each narrative

Note that each cluster represents a narrative
