![TP](../TeachersPro-logo-color.png)

# Twitter - Topic Modeling

### Lenin Escobar <lenin.escobar@net.teacherspro.com> - Descriptive analytics (18-January-2022)

In [3]:
# Notebook setup: IPython completer option, memory_profiler extension and inline plots.
%config IPCompleter.greedy=True
%load_ext memory_profiler
%matplotlib inline
# Execute the helper notebooks in this kernel. Names used below without a visible
# import (pd, np, spacy, PlottingHelper, CleaningHelper, ...) presumably come from
# them — TODO confirm.
%run TwitterAnalysisHelper.ipynb
%run TwitterAnalysisGeneralHelper.ipynb
%run TwitterAnalysis_Plot.ipynb

In [None]:
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

<h3 style="background-color:powderblue;">Loading Raw Data</h3>

In [None]:
# Plotting helper. PlottingHelper is not defined in this notebook (presumably provided
# by one of the %run helper notebooks — TODO confirm).
plottingHelper = PlottingHelper(version = "1.0")

In [None]:
# Connect to the sqlite3 database.
# This is the same data source that was used for training. It is dynamic, though:
# every run of the backend collection process adds new tweets, so new rows are
# available each time.
# The model also needs to be retrained from time to time
# (the retraining period has not been defined yet).
dbConn = Sqlite3Db('data/social_network.db')

In [None]:
# Single source of truth for the selected columns: the SELECT list and the DataFrame
# header are both derived from it, so the two can no longer drift apart.
TWEET_COLUMNS = [
    'insert_timestamp', 'tweet_timestamp', 'tweet_term', 'tweet',
    'place_type', 'place_name', 'place_full_name', 'place_country_code', 'place_country',
]

dbConn.query("SELECT " + ", ".join(TWEET_COLUMNS) + " FROM Tweets;")

# Fetch every row of the result and label the columns in SELECT order.
sqlStmRes = dbConn.cursor.fetchall()
df_Tweets_original = pd.DataFrame(sqlStmRes, columns=TWEET_COLUMNS)

In [None]:
# Close the sqlite3 database connection
dbConn.close()

In [None]:
# Preview the first rows of the freshly loaded frame
df_Tweets_original.head(3)

In [None]:
<h3 style="background-color:powderblue;">Cleaning Raw Data</h3>

In [None]:
# Helper used by the cleaning steps; printing it shows its string representation.
cleaningHelper = CleaningHelper(version = "1.0")
print(cleaningHelper)

In [None]:
# No null values are expected in the raw data
cleaningHelper.get_nulls_data(df_Tweets_original)

In [None]:
# Work on a copy so the originally loaded frame stays untouched
df_Tweets_mod = df_Tweets_original.copy()

In [None]:
# Start by inspecting the tweet terms, tweet dates and places.
# The rows were read with a plain fetchall(), so the timestamps may still be strings;
# pd.to_datetime() normalises them (and leaves datetime64 data unchanged) before the
# .dt accessor is used, which raises AttributeError on non-datetime columns.
df_Tweets_mod['tweet_timestamp_date'] = pd.to_datetime(df_Tweets_mod['tweet_timestamp']).dt.date

In [None]:
# Add a UTF-8 decoded "<column>_str" companion for each raw text column.
for raw_col in ('tweet', 'tweet_term'):
    df_Tweets_mod[raw_col + '_str'] = df_Tweets_mod[raw_col].str.decode("utf-8")

In [None]:
# Check the column dtypes after adding the derived columns
df_Tweets_mod.dtypes

In [None]:
# Preview the first rows, including the new columns
df_Tweets_mod.head(3)

In [None]:
<h3 style="background-color:powderblue;">Raw Data Plotting</h3>

In [None]:
# NOTE(review): plottingHelper was already created with the same arguments in the
# "Loading Raw Data" section; this cell replaces it with a fresh instance.
plottingHelper = PlottingHelper(version = "1.0")

In [None]:
# Hand the frame to the helper, then drive its count plot through interactive widgets.
plottingHelper._df_cat = df_Tweets_mod

# Columns offered in the "Feature" dropdown
feature_options = ["tweet_term_str", "tweet_str", "place_type", "place_name",
                   "place_full_name", "place_country_code", "place_country", "tweet_timestamp_date"]

ipywidgets.interact(
    plottingHelper.func_count_cat_plotty,
    x_var_size=ipywidgets.IntSlider(
        layout={'border': '1px solid black'}, min=1, max=100, value=10, step=1, description="Num.Records"),
    x_var=ipywidgets.Dropdown(
        layout={'border': '1px solid black'}, options=feature_options, description="Feature"),
    x_var_asc=ipywidgets.Checkbox(
        layout={'border': '1px solid black'}, value=False, description="Asc. Order"),
    ax_title=ipywidgets.Textarea(
        layout={'border': '1px solid black'}, value="Tweets by Feature", description="Title"),
);

In [None]:
<h3 style="background-color:powderblue;">Cleaning Pre-processed data with Spacy</h3>

In [None]:
#!python -m spacy download en_core_web_lg

In [None]:
# Punctuation characters taken from string.punctuation
punctuation_str = string.punctuation  
punctuation_str  # displayed to check whether "@" is actually included

In [None]:
# Stop-word set: custom list obtained from the cleaning helper
# (alternative kept for reference: spacy.lang.en.stop_words.STOP_WORDS)
stop_words = cleaningHelper.get_custom_stop_words(spacy_ = spacy_lg)

In [None]:
# English parser object
parser = spacy.lang.en.English()

In [None]:
# Tweet cleaner wired up with the objects prepared in the cells above;
# printing it shows its string representation.
cleaningTweets = CleaningTweets(version = "1.0", spacy_ = spacy_lg, parser_ = parser, punctuation_str_ = punctuation_str, stop_words_ = stop_words)
print(cleaningTweets)

In [None]:
# Build the token DataFrame from the tweets (the cells below read its token_ column)
df_doc_tokens = cleaningTweets.get_words_df(df_Tweets_ = df_Tweets_mod)

In [None]:
# Column summary of the token frame
df_doc_tokens.info()

In [None]:
df_doc_tokens.head(3)

In [None]:
#df_doc_tokens

In [None]:
# Five most frequent tokens. value_counts() already groups by value, so the extra
# groupby on the same column was redundant and only produced a duplicated
# (token_, token_) index.
df_doc_tokens['token_'].value_counts().nlargest(5)

In [None]:
# Top 10 tokens: count the rows per token, then keep the ten largest counts.
token_counts = (
    df_doc_tokens[['token_']]
    .groupby(['token_'])['token_']
    .count()
    .reset_index(name='count')
)
df_doc_tokens_grp = token_counts.sort_values(['count'], ascending=False).head(10)
df_doc_tokens_grp

In [None]:
# Token list built from the token DataFrame (it is later joined with " ", so the
# items are presumably plain strings — confirm against get_words_list)
ls_doc_tokens = cleaningTweets.get_words_list(df_Tweets_ = df_doc_tokens)
#ls_doc_tokens

In [None]:
# Count every token and show the 10 most common
word_freq = collections.Counter(ls_doc_tokens)
word_freq.most_common(10)

In [None]:
# Load the mask image inside a context manager so the file handle is released as soon
# as the pixel data has been copied into the array.
with Image.open("covid19.png") as mask_image:
    char_mask = np.array(mask_image)

# Configure the word cloud (fixed random_state keeps the layout reproducible)
wc = wordcloud.WordCloud(background_color='white', max_words=300, stopwords=stop_words, collocations=False, max_font_size=40, random_state=42, mask=char_mask)
# Generate the cloud from the lower-cased, space-joined tokens
wc = wc.generate(" ".join(ls_doc_tokens).lower())

# Draw on an explicit Axes instead of relying on pyplot's implicit current figure
fig, ax = plt.subplots(figsize=(15, 15))
ax.imshow(wc, interpolation='bilinear')
ax.axis("off")
plt.show()

In [None]:
<h3 style="background-color:powderblue;">Preparing the corpus</h3>

In [None]:
# Quick look at the working frame before the corpus is built
df_Tweets_mod.head(2)

In [None]:
# tqdm.pandas() registers progress_apply on pandas objects; calling it (rather than
# plain apply) is what actually displays the progress bar while each tweet is cleaned.
tqdm.pandas()
df_Tweets_mod["processed_tweet_str"] = df_Tweets_mod["tweet_str"].progress_apply(cleaningTweets.get_token_list)

In [None]:
# Inspect the cleaned tweet text
df_Tweets_mod["processed_tweet_str"]

In [None]:
# Empty frame with a single tweet_str column
column_names = ["tweet_str"]
df_clean = pd.DataFrame(columns = column_names)

In [None]:
# Two hand-written sample tweets, built in one step.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and appending made
# this cell add duplicate rows every time it was re-run; constructing the frame
# directly gives the same two rows on every run.
df_clean = pd.DataFrame({'tweet_str': ['thank share quote 🌺 🌸', '@testuser']})

In [None]:
# Show the sample frame before processing
df_clean

In [None]:
# Run the same cleaning function used for the real tweets on the sample rows
df_clean["processed_tweet_str"] = df_clean["tweet_str"].apply(cleaningTweets.get_token_list)

In [None]:
# Show the sample frame with the processed column added
df_clean

In [None]:
# Count Vectorizer: fit on the cleaned tweet text with default settings
# (variant kept for reference: CountVectorizer(min_df=0.02))
countVectorizer = CountVectorizer()
countVectorizerData = countVectorizer.fit_transform(df_Tweets_mod["processed_tweet_str"])
print(type(countVectorizer))

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# vocabulary_ maps each term to its column index, so sorting the terms by that index
# yields the same list on every scikit-learn version.
print(sorted(countVectorizer.vocabulary_, key=countVectorizer.vocabulary_.get))

In [None]:
# Show only the first rows: toarray() on the whole sparse matrix allocates a dense
# (documents x vocabulary) array, which can exhaust memory on a large tweet set.
print(countVectorizerData[:5].toarray())

In [None]:
# TF-IDF over the cleaned tweet text.
# token_pattern is a raw string: '\-' in a normal string literal is an invalid escape
# sequence (a warning today, an error in future Python versions). The regex is unchanged.
vectorizer = TfidfVectorizer(
    #min_df=0.1,
    #max_df=1.0,
    stop_words='english', lowercase=True, token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(df_Tweets_mod["processed_tweet_str"])
# get_feature_names() was removed in scikit-learn 1.2; the terms of vocabulary_ sorted
# by their column index give the same list on every version.
print(sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get))

In [None]:
print(len(vectorizer.get_feature_names()))

In [None]:
vectorizer.get_feature_names()

In [None]:
# Dimensions of the TF-IDF matrix returned by fit_transform
data_vectorized.shape

In [None]:
# First dimension only
data_vectorized.shape[0]

In [None]:
# Concrete type of the matrix object
type(data_vectorized)

In [None]:
data_vectorized.shape

In [None]:
# Show only the first rows: toarray() on the whole sparse matrix allocates a dense
# (documents x vocabulary) array, which can exhaust memory on a large tweet set.
print(data_vectorized[:5].toarray())

In [None]:
# Same matrix converted with tocsc() (compressed sparse column layout)
data_vectorized_csc = data_vectorized.tocsc(copy=False)

In [None]:
data_vectorized_csc.shape

In [None]:
# Topic-model settings.
# NOTE(review): not used in the cells visible here — presumably consumed by the
# training code; confirm.
NUM_TOPICS = 10
NUM_PASSES = 5

In [None]:
#####################
#Working with gensim#
#####################

In [None]:
# Cleaned tweet text that is tokenised in the next cell
df_Tweets_mod["processed_tweet_str"]

In [None]:
# Per-tweet result of cleaningTweets.get_tokens; this column is what the gensim
# Dictionary and the corpus are built from below
df_Tweets_mod['processed_tweet_str_tokens'] = df_Tweets_mod["processed_tweet_str"].apply(cleaningTweets.get_tokens)

In [None]:
df_Tweets_mod['processed_tweet_str_tokens']

In [None]:
# Build the dictionary from the tokenised tweets and print its size
id2word = Dictionary(df_Tweets_mod['processed_tweet_str_tokens'])
print(len(id2word))

In [None]:
# Filter extremes (no_below=2, no_above=.99), then print the reduced size
id2word.filter_extremes(no_below=2, no_above=.99)
print(len(id2word))

In [None]:
# Create the corpus object: one doc2bow() result per tweet
corpus = [id2word.doc2bow(d) for d in df_Tweets_mod['processed_tweet_str_tokens']]

In [None]:
<h3 style="background-color:powderblue;">Loading our pre-trained model</h3>

In [4]:
# Load the pre-trained topic model back from file
import os

# NOTE(review): "Topic_mode_.pkl" looks like it may be a typo for "Topic_model_.pkl" —
# confirm against the cell that saves the model.
Pkl_Filename = "models/Topic_mode_.pkl"

# An empty file makes pickle.load() fail with a bare "EOFError: Ran out of input".
# Detect that case up front and name the offending file; the exception type is unchanged.
if os.path.isfile(Pkl_Filename) and os.path.getsize(Pkl_Filename) == 0:
    raise EOFError(
        "Model file '" + Pkl_Filename + "' is empty; re-run the training notebook "
        "to save the model again before loading it."
    )

# SECURITY: pickle.load() can execute arbitrary code while unpickling; only load model
# files that were produced by this project.
with open(Pkl_Filename, 'rb') as file:
    Pickled_TM_Model = pickle.load(file)

Pickled_TM_Model

EOFError: Ran out of input