# Twitter Wordcloud

#### Importing libraries

In [1]:
import numpy as np
import pandas as pd
from wordcloud import WordCloud
import stylecloud
import os

import spacy
from spacy.lang.en import English

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the "en_core_web_lg" spaCy pipeline once; hashtags_extraction and
# data_cleaning below both tokenize through this shared `nlp` object.
nlp = spacy.load("en_core_web_lg")

### 1.  Extracted Tweets File

In [3]:
# Read the entire extracted-tweets text file into a single string.
# NOTE(review): no encoding is passed, so the platform default is used —
# confirm it matches how the file was written.
with open('../data/text_files/extracted_tweets.txt', 'r') as f:
    extracted_tweets = f.read()

#### Hashtag Extraction

In [4]:
def hashtags_extraction(string, splitter='\\n'):
    """Return the cleaned hashtags found in *string* as one space-separated string.

    Args:
        string: Raw tweet text containing every tweet to scan.
        splitter: Delimiter between tweets. The default is the literal
            two-character sequence backslash + "n", not a newline character.

    Returns:
        The lowercased hashtag tokens that survive filtering, joined by
        single spaces.
    """
    # Collect the text following the leading '#' of every space-separated
    # word that starts with '#'.
    raw_tags = []
    for tweet in string.split(splitter):
        for word in tweet.split(' '):
            if word.startswith('#'):
                raw_tags.append(word.split('#')[1])

    # Drop anything after a real newline inside the tag, then lowercase it.
    lowered_tags = [tag.split('\n')[0].lower() for tag in raw_tags]

    # Run the tags through the shared spaCy pipeline and keep only alphabetic,
    # non-punctuation, non-stopword tokens that are at least two characters long.
    doc = nlp(" ".join(lowered_tags))
    kept_tokens = [
        token.lower_
        for token in doc
        if not token.is_punct
        and not token.is_stop
        and len(token) >= 2
        and token.is_alpha
    ]
    return ' '.join(kept_tokens)

In [5]:
# Cleaned hashtags from the extracted-tweets file, using the default splitter.
hashtags_string = hashtags_extraction(extracted_tweets)

#### Hashtags Wordcloud

In [7]:
# Generate a stylecloud image from the extracted-tweets hashtags, using the
# 'fab fa-twitter' icon, and write it to the output_name path below.
stylecloud.gen_stylecloud(text=hashtags_string,
                          size=2048,
                          icon_name='fab fa-twitter',
                          output_name='../wordclouds/new/extracted_hashtags.png',
                          max_font_size=500,
                          max_words=3000,
                          background_color='white'
                         )

#### Tweets Cleaning

#### Adding custom stopwords for data cleaning

In [8]:
# Project-specific words to treat as stopwords in addition to spaCy's defaults.
# NOTE(review): these look like the account's own recurring hashtags,
# presumably excluded so they do not dominate the clouds — confirm.
stopwords = ['ubuffalo', 'ubtrueblue', 'ubhornsup', 'ubalumni', 'ubgse', 'ubbulls', 'ubmgt']

In [9]:
# Add the custom stopwords to the language defaults so that newly created
# vocabulary entries are flagged as stopwords.
nlp.Defaults.stop_words |= set(stopwords)

# The is_stop flag is stored on each vocabulary entry, and `nlp` has already
# processed hashtag text in an earlier cell, so entries created before the
# update above may still carry the old flag. Set it explicitly on the shared
# pipeline's vocab so the custom words are filtered either way.
for custom_stopword in stopwords:
    nlp.vocab[custom_stopword].is_stop = True

In [10]:
def data_cleaning(doc):
    """Return the lowercased noun lemmas of *doc* as one space-separated string.

    Args:
        doc: Raw tweet text to tokenize with the shared spaCy pipeline.

    Returns:
        The lowercased lemmas of every token that is alphabetic, is neither
        punctuation nor a stopword, and is tagged NOUN or PROPN, joined by
        single spaces.
    """
    wanted_pos = ['NOUN', 'PROPN']
    lemmas = [
        token.lemma_.lower()
        for token in nlp(doc)
        if not token.is_punct
        and not token.is_stop
        and token.pos_ in wanted_pos
        and token.is_alpha
    ]
    return ' '.join(lemmas)

In [11]:
# Noun/proper-noun lemmas from the extracted-tweets file.
cleaned_tweet_string = data_cleaning(extracted_tweets)

#### Tweets Wordcloud

In [12]:
# Generate a stylecloud image from the cleaned extracted tweets, using the
# 'fab fa-twitter-square' icon, and write it to the output_name path below.
stylecloud.gen_stylecloud(text=cleaned_tweet_string,
                          size=2048,
                          icon_name='fab fa-twitter-square',
                          output_name='../wordclouds/new/extracted_tweets.png')

### 2. Twitter Data from 2000 to March 1 File

In [13]:
# Read the entire "#ubalumni" tweets text file into a single string.
# NOTE(review): no encoding is passed, so the platform default is used —
# confirm it matches how the file was written.
with open('../data/text_files/twitter_data_from_2000_to_march_1_#ubalumni.txt', 'r') as f:
    tweets_from_2000 = f.read()

#### Hashtags

In [14]:
# Cleaned hashtags from the "#ubalumni" tweets file, using the default splitter.
alumni_hashtags = hashtags_extraction(string=tweets_from_2000)

In [15]:
# Generate a stylecloud image from the alumni hashtags, using the
# 'fab fa-twitter' icon, and write it to the output_name path below.
stylecloud.gen_stylecloud(text=alumni_hashtags,
                          size=2048,
                          icon_name='fab fa-twitter',
                          output_name='../wordclouds/new/alumni_hashtags.png')

#### Tweets

In [16]:
# Noun/proper-noun lemmas from the "#ubalumni" tweets file.
cleaned_tweet_from_2000 = data_cleaning(tweets_from_2000)

In [17]:
# Generate a stylecloud image from the cleaned "#ubalumni" tweets, using the
# 'fab fa-twitter-square' icon, and write it to the output_name path below.
stylecloud.gen_stylecloud(text=cleaned_tweet_from_2000,
                          size=2048,
                          icon_name='fab fa-twitter-square',
                          output_name='../wordclouds/new/tweets_from_2000.png')

### 3. All Tweets

In [18]:
# Load the combined tweets CSV, using its first column as the index, then
# preview the first rows.
tweets_df = pd.read_csv('../data/combined_files/all_tweets.csv',index_col=0)
tweets_df.head()

Unnamed: 0,Tweet
0,@TheBuffaloNews Great to see the osteointegrat...
1,After an @AANMember study documented the pay g...
2,Join #UBGSE for Black History Nerds Saturday S...
3,Michael Rembis is the director of the Center f...
4,"Thank you, @NeurologyToday, for giving me the ..."


#### Hashtags

In [20]:
# Collect every space-separated word that starts with '#' from the Tweet
# column, trimming the listed punctuation characters from both ends and
# lowercasing the result.
hashtags_1 = [
    word.strip("'#@*&,:;/").lower()
    for tweet in tweets_df['Tweet']
    for word in tweet.split(' ')
    if word.startswith('#')
]

In [21]:
# Load the combined Twitter CSV (Tweet and Hashtag columns), using its first
# column as the index, then preview the first rows.
excel_df = pd.read_csv('../data/excel_files/combined_twitter_df.csv', index_col=0)
excel_df.head()

Unnamed: 0,Tweet,Hashtag
0,@TheBuffaloNews Great to see the osteointegrat...,
1,After an @AANMember study documented the pay g...,ubuffalo
2,Join #UBGSE for Black History Nerds Saturday S...,ubgse blackhistorymonth ubuffalo
3,Michael Rembis is the director of the Center f...,
4,"Thank you, @NeurologyToday, for giving me the ...",paygaps genderinequity neurology ubuffalo wome...


In [23]:
# Flatten the non-missing Hashtag cells into individual tags, trimming the
# listed punctuation characters from both ends and lowercasing each one.
hashtags_2 = [
    tag.strip("'#@*&,:;/").lower()
    for cell in excel_df['Hashtag'].dropna()
    for tag in cell.split(' ')
]

In [24]:
# Merge the hashtags gathered from the Tweet column with those from the
# Hashtag column, then flatten them into one space-separated string.
total_hashtags = [*hashtags_1, *hashtags_2]
hashtags_new_excel_string = ' '.join(total_hashtags)

In [31]:
# Tokenize the combined hashtags with a freshly loaded pipeline instance and
# keep only alphabetic tokens that are not stopwords, punctuation, digits or
# whitespace, lowercased.
nlp_2 = spacy.load("en_core_web_lg")
final_hashtags_cleaned = [
    tok.lower_
    for tok in nlp_2(hashtags_new_excel_string)
    if tok.is_alpha
    and not tok.is_stop
    and not tok.is_punct
    and not tok.is_digit
    and not tok.is_space
]
final_hashtags_string = ' '.join(final_hashtags_cleaned)

In [32]:
# Generate a stylecloud image from the combined CSV hashtags, using the
# 'fab fa-twitter' icon, and write it to the output_name path below.
stylecloud.gen_stylecloud(text=final_hashtags_string,
                          size=2048,
                          icon_name='fab fa-twitter',
                          output_name='../wordclouds/new/all_tweets_hashtags.png')

#### Tweets

In [33]:
# Concatenate every tweet in the Tweet column into one space-separated string.
tweets_str = ' '.join(tweets_df['Tweet'])

# Only the first 1,000,000 characters are cleaned; any text past that point is
# dropped, and the cut may fall in the middle of a word.
# NOTE(review): the cap presumably matches spaCy's default maximum text
# length — confirm.
cleaned_new_tweets = data_cleaning(tweets_str[:1000000])   # truncated because of the length limit

In [34]:
# Generate a stylecloud image from the cleaned CSV tweets, using the
# 'fab fa-twitter-square' icon, and write it to the output_name path below.
stylecloud.gen_stylecloud(text=cleaned_new_tweets,
                          size=2048,
                          icon_name='fab fa-twitter-square',
                          output_name='../wordclouds/new/excel_tweets.png')

### Combined Cleaned Data from all Twitter Files

#### Hashtags

In [35]:
# Combine the cleaned hashtags from all three sources into one string.
# Each source is a space-separated word list with no trailing space, so the
# pieces must be joined with a space; plain `+` concatenation would fuse the
# last word of one source onto the first word of the next. Empty sources are
# skipped so no doubled spaces are introduced.
final_tags = ' '.join(
    part
    for part in (hashtags_string, alumni_hashtags, final_hashtags_string)
    if part
)

In [36]:
# Persist the combined hashtag string, overwriting any existing file.
# NOTE(review): no encoding is passed, so the platform default is used.
with open('../data/combined_files/combined_hashtags.txt', 'w') as f:
    f.write(final_tags)

In [37]:
# Generate a stylecloud image from the combined hashtags of all sources, using
# the 'fab fa-twitter' icon on a black background, and write it to the
# output_name path below.
stylecloud.gen_stylecloud(text=final_tags,
                          size=2048,
                          icon_name='fab fa-twitter',
                          output_name='../wordclouds/new/final_hashtags.png',
                         max_font_size=300,
                         background_color='black')

#### Tweets

In [38]:
# Combine the cleaned tweet text from all three sources into one string.
# Each source is a space-separated word list with no trailing space, so the
# pieces must be joined with a space; plain `+` concatenation would fuse the
# last word of one source onto the first word of the next. Empty sources are
# skipped so no doubled spaces are introduced.
final_str = ' '.join(
    part
    for part in (cleaned_tweet_string, cleaned_tweet_from_2000, cleaned_new_tweets)
    if part
)

#### Generating wordcloud for combined tweets

In [39]:
# Persist the combined cleaned-tweet string, overwriting any existing file.
# NOTE(review): no encoding is passed, so the platform default is used.
with open('../data/combined_files/combined_tweets.txt', 'w') as f:
    f.write(final_str)

In [40]:
# Generate a stylecloud image from the combined cleaned tweets of all sources,
# using the 'fab fa-twitter-square' icon, and write it to the output_name path
# below.
stylecloud.gen_stylecloud(text=final_str,
                          size=2048,
                          icon_name='fab fa-twitter-square',
                          output_name='../wordclouds/new/final_tweets.png')