# Word Clouds from Covid-19 tweets

In [None]:
import numpy as np 
import pandas as pd
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# Apply the 'seaborn-poster' style sheet to the figures drawn below.
# NOTE(review): newer Matplotlib releases appear to have renamed the seaborn-*
# style sheets (e.g. 'seaborn-v0_8-poster') — confirm against the installed version.
plt.style.use('seaborn-poster')
# IPython magic (not plain Python): render figures inline in the notebook output.
%matplotlib inline
# Default (width, height) for figures that do not pass their own figsize.
plt.rcParams['figure.figsize'] = 12, 6

### Explore table

In [None]:
# Load the tweets, parsing the 'user_created' and 'date' columns as datetimes,
# then print the per-column summary of the resulting frame.
tweets_df = pd.read_csv("covid19_tweets.csv", parse_dates=['user_created', 'date'])
tweets_df.info()

In [None]:
# Preview the first rows of the table.
tweets_df.head()

### Detect missing values

In [None]:
# Number of missing values in each column.
tweets_df.isna().sum()

In [None]:
# Share of missing entries per column, as a percentage rounded to two decimals.
percent_missing = [
    round(100 * tweets_df[name].isnull().sum() / len(tweets_df), 2)
    for name in tweets_df.columns
]
missing_values = pd.DataFrame({'column': tweets_df.columns, 'percent': percent_missing})

# Sort ascending by share and keep only the columns that actually have gaps.
missing_values = (
    missing_values
    .sort_values('percent')
    .loc[lambda frame: frame['percent'] > 0]
)

plt.figure(figsize=(15, 5))
sns.set(style='whitegrid', color_codes=True)
splot = sns.barplot(data=missing_values, x='column', y='percent')

# Write each bar's value just above its top edge, centred horizontally.
for bar in splot.patches:
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    splot.annotate(
        format(bar_top, '.2f'),
        (bar_center, bar_top),
        ha='center',
        va='center',
        xytext=(0, 9),
        textcoords='offset points',
    )

plt.xlabel("Column_Name", size=14, weight="bold")
plt.ylabel("Percentage", size=14, weight="bold")
plt.title("Percentage of missing values in column", fontweight="bold", size=17)
plt.show()

### Explore Data Distributions

In [None]:
def plot_count(feature, title, df, size=1, ordered=True):
    """Draw a count plot of ``df[feature]`` with a percentage label on each bar.

    Args:
        feature: Name of the column in ``df`` to count.
        title: Human-readable name inserted into the plot title.
        df: DataFrame holding the data; its row count is the denominator
            for the percentage labels.
        size: Width multiplier; the figure is ``4 * size`` wide and 4 high.
            Values above 2 also rotate the x tick labels by 90 degrees.
        ordered: When true, show only the 20 most frequent values, most
            frequent first. Otherwise plot every value in seaborn's
            default order.
    """
    _, ax = plt.subplots(1, 1, figsize=(4 * size, 4))
    n_rows = float(len(df))
    column = df[feature]

    # Both variants share everything except the optional explicit ordering.
    countplot_kwargs = {"x": column, "color": 'forestgreen'}
    if ordered:
        countplot_kwargs["order"] = column.value_counts().index[:20]
    chart = sns.countplot(**countplot_kwargs)
    chart.set_title("Number and percentage of {}".format(title))

    if size > 2:
        plt.xticks(rotation=90, size=12)

    # Label every bar with its share of all rows, placed slightly above the bar.
    for bar in ax.patches:
        bar_height = bar.get_height()
        label_x = bar.get_x() + bar.get_width() / 2.
        label_text = '{:1.2f}%'.format(100 * bar_height / n_rows)
        ax.text(label_x, bar_height + 3, label_text, ha="center")
    plt.show()

In [None]:
# 20 most frequent user names, each labelled with its share of all tweets.
plot_count("user_name", "User name", tweets_df,4)

In [None]:
# 20 most frequent user locations, each labelled with its share of all tweets.
plot_count("user_location", "User location", tweets_df,4)

In [None]:
# 20 most frequent tweet sources, each labelled with its share of all tweets.
plot_count("source", "Source", tweets_df,4)

In [None]:
# Number of tweets per calendar day, taken from the date part of the
# 'date' timestamps.
date_count = tweets_df['date'].dt.date.value_counts().reset_index()
# Name the two columns explicitly rather than relying on reset_index defaults.
date_count.columns = ['date', 'count']

sns.lineplot(x="date", y="count", data=date_count)
plt.xticks(rotation=30)
# Call show(): the bare name `plt.show` is a no-op expression that only echoes
# the function object as the cell output.
plt.show()

In [None]:
# Timestamps of tweets whose user_location is one of the two London spellings.
times = tweets_df['date'][tweets_df['user_location'].isin(["London, England", "London"])]
# Count tweets per hour of day (the hour component of the 'date' timestamp).
times = times.groupby([times.dt.hour]).count()
time_count = pd.DataFrame({'time': times.index,
                           'count': times.values})
sns.barplot(x="time", y="count", data=time_count, color='forestgreen')
plt.xticks(rotation=30)
# Call show(): the bare name `plt.show` is a no-op expression that only echoes
# the function object as the cell output.
plt.show()

#### What other information do you think we should plot to better understand the data?  
  
  
  

### Visualize Word Clouds

In [None]:
def show_wordcloud(data, title = None):
    """Render a word cloud of the text in ``data``.

    Args:
        data: The text to visualise. A plain string is used as-is. A pandas
            Series, list or tuple has its non-null entries converted to
            strings and joined with spaces, so every entry contributes.
            Any other object falls back to ``str(data)``.
        title: Optional figure title; when falsy no title is drawn.
    """
    if isinstance(data, str):
        text = data
    elif isinstance(data, (pd.Series, list, tuple)):
        # str() of a Series is its *display repr*: long Series are cut down to
        # a few head/tail rows, long strings are clipped, and a
        # "Name: ..., dtype: ..." footer is appended. Join the real values
        # instead so the cloud reflects the whole collection.
        text = " ".join(pd.Series(data, dtype=object).dropna().astype(str))
    else:
        text = str(data)

    wordcloud = WordCloud(
        background_color='white',
        stopwords=set(STOPWORDS),
        max_words=50,
        max_font_size=40, 
        scale=5,
        random_state=1  # fixed seed so repeated runs give the same layout
    ).generate(text)

    # Figure number 1 is reused on every call rather than creating a new figure.
    fig = plt.figure(1, figsize=(10,10))
    plt.axis('off')
    if title: 
        fig.suptitle(title, fontsize=20)
        # NOTE(review): top=2.3 extends the axes area beyond the figure; this
        # presumably relies on the notebook backend cropping to the drawn
        # content — confirm before reusing outside a notebook.
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()

In [None]:
# Pass the whole 'text' column to show_wordcloud.
show_wordcloud(tweets_df['text'], title = 'Common words in tweets')

In [None]:
# Select the text of tweets whose user_location is one of the two London spellings.
london_locations = ["London, England", "London"]
from_london = tweets_df["user_location"].isin(london_locations)
london_tweets = tweets_df.loc[from_london, 'text']
show_wordcloud(london_tweets, title = 'Common words in tweets from London')