In [None]:
import io
import json

import numpy as np
import pandas as pd

## Read the JSON file containing the captured real-time tweets

In [None]:
# Read the capture file as raw bytes, one list element per line.
# Decoding to text is done separately by the cell that consumes `tweets`.
with open('india.json', 'rb') as f:
    tweets = f.readlines()
# Preview only the first few lines; displaying the whole list floods the output.
tweets[:5]

## Decode each line as UTF-8 and create a list of all tweets

In [None]:
# Decode every raw line as UTF-8 and strip trailing whitespace (including the "\n").
# A real list is built instead of a lazy `map` iterator: an iterator is exhausted
# after its first use, so re-running a cell that joins it would yield an empty result.
# Lines that are empty after stripping are dropped, since they would become empty
# elements when the lines are later joined with commas into a JSON array.
decoded_lines = (raw_line.decode("utf-8").rstrip() for raw_line in tweets)
tweetsdf = [line for line in decoded_lines if line]

In [None]:
# Join the per-line JSON documents with commas and wrap them in brackets,
# producing one JSON array literal.
data_json_str = "[" + ','.join( tweetsdf ) + "]"
# Show only a short prefix; the full string contains every line of the capture.
data_json_str[:500]

## Read the JSON records into a DataFrame

In [None]:
# Parse the JSON array into a DataFrame.
# pandas deprecated passing a literal JSON string to read_json (2.1+), so the
# string is wrapped in a file-like StringIO object.
tweetsdf = pd.read_json(io.StringIO(data_json_str))

In [None]:
# Peek at the first five rows of the parsed frame.
tweetsdf.head(n=5)

In [None]:
# First ten entries of the `text` column, selected by position.
tweetsdf["text"].iloc[:10]

## Remove all characters except English letters

In [None]:
import re

# Matches every character that is not an ASCII letter; compiled once and reused per row.
non_letters = re.compile("[^a-zA-Z]")

# Replace each non-letter character with a space.
# Missing values are turned into empty strings first, because the regex
# substitution raises TypeError when handed a non-string such as NaN.
tweetsdf["text"] = tweetsdf["text"].fillna("").map(
    lambda comment: non_letters.sub(" ", comment)
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

## Let's find the terms and their frequencies

In [None]:
# Default bag-of-words vectorizer; fitting learns the vocabulary from the `text` column.
vect = CountVectorizer()
vect.fit(tweetsdf["text"])

In [None]:
# Learn the vocabulary and build the document-term count matrix in one step.
tweets_vec = vect.fit_transform(tweetsdf["text"])

In [None]:
# Total count of each term across all documents (column sums of the count matrix).
term_totals = np.asarray(tweets_vec.sum(axis=0)).ravel()

# One row per vocabulary term with its total frequency.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and returns the terms in matrix-column order.
word_freq_df = pd.DataFrame({
    'term': vect.get_feature_names_out(),
    'tf': term_totals.tolist(),
})

In [None]:
# Print a summary of the term-frequency frame (columns, dtypes, non-null counts).
word_freq_df.info()

In [None]:
# Peek at the first five term/frequency rows.
word_freq_df.head(n=5)

In [None]:
# Ten most frequent terms.
# DataFrame.sort() was removed from pandas (0.20); sort_values() is the replacement.
word_freq_df.sort_values("tf", ascending=False).head(10)

## Remove all stop words

In [None]:
# Recompute term frequencies, this time dropping scikit-learn's built-in
# English stop-word list.
vect = CountVectorizer(stop_words="english")

# fit_transform() both learns the vocabulary and builds the count matrix,
# so a separate fit() call beforehand is redundant.
tweets_m = vect.fit_transform(tweetsdf.text)

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
word_freq_df = pd.DataFrame({
    'term': vect.get_feature_names_out(),
    'tf': np.asarray(tweets_m.sum(axis=0)).ravel().tolist(),
})

# DataFrame.sort() was removed from pandas; sort_values() is the replacement.
word_freq_df.sort_values("tf", ascending=False).head(10)

In [None]:
from sklearn.feature_extraction import text 

## Extend the stop-word list and create n-grams

In [None]:
# Extra terms to exclude on top of scikit-learn's English stop-word list.
additional_stop_words = ['ndtv', 'http', 'https', 'rt', 'india']

# The union of the two sets is materialised as a (sorted, deterministic) list:
# recent scikit-learn releases validate `stop_words` and accept only
# 'english', a list, or None, so a frozenset is rejected.
stop_words = sorted(text.ENGLISH_STOP_WORDS.union(additional_stop_words))

In [None]:
# Count unigrams, bigrams and trigrams, excluding the extended stop-word list.
# `stop_words` is coerced to a list because recent scikit-learn releases
# reject other collection types (e.g. a frozenset) for this parameter.
vect = CountVectorizer(ngram_range=(1, 3), stop_words=list(stop_words))

# fit_transform() both learns the vocabulary and builds the count matrix,
# so a separate fit() call beforehand is redundant.
tweets_m = vect.fit_transform(tweetsdf.text)

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
word_freq_df = pd.DataFrame({
    'term': vect.get_feature_names_out(),
    'tf': np.asarray(tweets_m.sum(axis=0)).ravel().tolist(),
})

# Twenty most frequent n-grams (DataFrame.sort() no longer exists in pandas).
word_freq_df.sort_values("tf", ascending=False).head(20)

## Find TF-IDFs for the terms

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights for unigrams and bigrams, excluding the extended stop-word list.
# `stop_words` is coerced to a list because recent scikit-learn releases
# reject other collection types (e.g. a frozenset) for this parameter.
tfidfvect = TfidfVectorizer(ngram_range=(1, 2), stop_words=list(stop_words))

# fit_transform() both learns the vocabulary/IDF and builds the weighted matrix,
# so a separate fit() call beforehand is redundant.
tweets_m = tfidfvect.fit_transform(tweetsdf.text)

# Sum each term's TF-IDF weight over all documents.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
tfidf_df = pd.DataFrame({
    'term': tfidfvect.get_feature_names_out(),
    'tfidf': np.asarray(tweets_m.sum(axis=0)).ravel().tolist(),
})

# Twenty terms with the highest summed TF-IDF (DataFrame.sort() no longer exists in pandas).
tfidf_df.sort_values("tfidf", ascending=False).head(20)