# Disaster Tweets EDA

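This notebook contains an exploratory data analysis (EDA) of the disaster tweets dataset, from basic class and length statistics to stop word, punctuation, common word, and n-gram analysis.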

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import string

from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import defaultdict
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

# Plot styling and shared lookups used by the cells below.
# NOTE(review): both styling calls change global plotting state, so their order is left as-is.
# Apply seaborn's 'pastel' palette to the color codes.
sns.set_color_codes('pastel')
# Use the 'ggplot' style sheet for every figure in the notebook.
plt.style.use('ggplot')
# English stop words from NLTK, kept as a set so the `word in stop_words` checks below are cheap.
stop_words = set(stopwords.words('english'))


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, entries in os.walk('/kaggle/input'):
    for entry in entries:
        print(os.path.join(folder, entry))

# Any results you write to the current directory are saved as output.

### Loading the dataset

In [None]:
# Read the three competition CSVs in order: train, test, sample submission.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/nlp-getting-started/train.csv',
        '../input/nlp-getting-started/test.csv',
        '../input/nlp-getting-started/sample_submission.csv',
    )
)

In [None]:
# Report the number of rows in each split (len(frame) equals frame.shape[0]).
print('Number of samples in train set: {}'.format(len(train)))
print('Number of samples in test set: {}'.format(len(test)))

### Distribution of the Target

In [None]:
# Count tweets per target class and reshape into a two-column frame
# ('target', 'number of samples') that seaborn can plot directly.
df = (
    train.groupby(['target'])['id']
    .count()
    .reset_index()
    .rename(columns={'id': 'number of samples'})
)

fig, ax = plt.subplots(figsize=(4, 6))
# Draw on the axes created just above (they are also the current axes).
sns.barplot(x='target', y='number of samples', data=df, ax=ax)

In [None]:
# Pull the per-class counts out of the summary frame built in the previous cell.
n_disaster = df.loc[df['target'] == 1, 'number of samples'].values[0]
print('Number of disaster tweets: {}'.format(n_disaster))
n_not_disaster = df.loc[df['target'] == 0, 'number of samples'].values[0]
print('Number of no disaster tweets: {}'.format(n_not_disaster))

The classes are roughly balanced, but there are about 1,000 more not disaster tweets than disaster tweets.

### Number of Characters in Tweets

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

# One histogram of tweet length in characters per class:
# disaster (target == 1) on the left, not disaster (target == 0) on the right.
char_panels = (
    (ax1, 1, 'blue', 'disaster tweets'),
    (ax2, 0, 'red', 'not disaster tweets'),
)
for panel_ax, panel_target, panel_color, panel_title in char_panels:
    tweet_len = train.loc[train['target'] == panel_target, 'text'].str.len()
    panel_ax.hist(tweet_len, color=panel_color)
    panel_ax.set_title(panel_title)

fig.suptitle('Number of characters in tweets')
plt.show()

Disaster and not disaster tweets have almost the same distribution of character counts. Tweets of 120 to 140 characters are the most common in both groups.

### Number of Words in Tweets

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

# One histogram of words per tweet (whitespace-separated tokens) for each class.
word_panels = (
    (ax1, 1, 'blue', 'disaster tweets'),
    (ax2, 0, 'red', 'not disaster tweets'),
)
for panel_ax, panel_target, panel_color, panel_title in word_panels:
    tokenized = train.loc[train['target'] == panel_target, 'text'].str.split()
    tweet_len = tokenized.map(len)
    panel_ax.hist(tweet_len, color=panel_color)
    panel_ax.set_title(panel_title)

fig.suptitle('Number of words in tweets')
plt.show()

It seems that not disaster tweets have slightly more words. This might be useful as a feature!

### Average Word Length in Tweets

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

# For each class, split every tweet on whitespace, measure each word's length,
# then plot the distribution of the per-tweet mean word length.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot(..., kde=True) once the seaborn version in use is confirmed.
dist_panels = (
    (ax1, 1, 'blue', 'disaster tweets'),
    (ax2, 0, 'red', 'not disaster tweets'),
)
for dist_ax, dist_target, dist_color, dist_title in dist_panels:
    words = train.loc[train['target'] == dist_target, 'text'].str.split().apply(lambda x: [len(i) for i in x])
    sns.distplot(words.map(lambda x: np.mean(x)), ax=dist_ax, color=dist_color)
    # Title both panels; previously only the right-hand panel was labelled.
    dist_ax.set_title(dist_title)

fig.suptitle('Average length of words in tweets')

plt.show()

Disaster and not disaster tweets have almost the same distribution of average word length.

In [None]:
# function to make corpus
def make_corpus(df, target):
    """Collect every whitespace-separated token from the tweets of one class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'target' column and a 'text' column.
    target : int
        Class label to select (rows where ``df['target'] == target``).

    Returns
    -------
    list of str
        Tokens in row order, then in order of appearance within each tweet.
        Duplicates are kept and tokens are not lower-cased or otherwise
        cleaned. Rows whose text is missing are skipped.
    """
    tweets = df.loc[df['target'] == target, 'text'].dropna()
    # str.split() with no arguments splits on any run of whitespace.
    return [token for tokens in tweets.str.split() for token in tokens]

### Stop Words

In [None]:
def _count_stop_words(corpus):
    """Return a defaultdict mapping each stop word found in `corpus` to its count.

    Tokens are compared against `stop_words` exactly as they appear in the
    corpus (no lower-casing or stripping is applied here).
    """
    counts = defaultdict(int)
    for token in corpus:
        if token in stop_words:
            counts[token] += 1
    return counts


def _top_by_count(counts, n=10):
    """Return the `n` (word, count) pairs with the highest counts, largest first."""
    return sorted(counts.items(), key=lambda pair: pair[1], reverse=True)[:n]


# Not disaster tweets (target == 0): corpus, stop-word tally, top 10.
not_disaster_corpus = make_corpus(train, target=0)
dic = _count_stop_words(not_disaster_corpus)
not_disaster_top_stop_words = _top_by_count(dic)

# Disaster tweets (target == 1): corpus, stop-word tally, top 10.
# `dic` is rebound here, so after this cell it holds the disaster tallies.
disaster_corpus = make_corpus(train, target=1)
dic = _count_stop_words(disaster_corpus)
disaster_top_stop_words = _top_by_count(dic)

In [None]:
# Unzip the (word, count) pairs into parallel sequences for plotting.
x1, y1 = zip(*disaster_top_stop_words)
x2, y2 = zip(*not_disaster_top_stop_words)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: disaster tweets; right panel: not disaster tweets.
for bar_ax, bar_labels, bar_heights, bar_color, bar_title in (
    (ax1, x1, y1, 'blue', 'disaster tweets'),
    (ax2, x2, y2, 'red', 'not disaster tweets'),
):
    bar_ax.bar(bar_labels, bar_heights, color=bar_color)
    bar_ax.set_title(bar_title)

fig.suptitle('Top 10 stop words')
plt.show()

In both classes, `the` is the most frequent stop word, followed by `a` in the not disaster class and `in` in the disaster class.

### Punctuations

In [None]:
punctuations = string.punctuation

dic_punc_not_disaster = defaultdict(int)
dic_punc_disaster = defaultdict(int)

# Tally the corpus tokens that are punctuation, one tally per class.
# NOTE(review): `punctuations` is a str, so `word in punctuations` is a
# substring test: a multi-character token that happens to be a contiguous run
# of string.punctuation also matches. Punctuation attached to a word (no
# surrounding whitespace) is not counted, because tokens come from a
# whitespace split.
for punc_corpus, punc_counts in (
    (not_disaster_corpus, dic_punc_not_disaster),
    (disaster_corpus, dic_punc_disaster),
):
    for word in punc_corpus:
        if word in punctuations:
            punc_counts[word] += 1

In [None]:
# Split each tally into parallel (token, count) sequences, in insertion order.
x1, y1 = zip(*dic_punc_disaster.items())
x2, y2 = zip(*dic_punc_not_disaster.items())

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: disaster tweets; right panel: not disaster tweets.
for punc_ax, punc_labels, punc_heights, punc_color, punc_title in (
    (ax1, x1, y1, 'blue', 'disaster tweets'),
    (ax2, x2, y2, 'red', 'not disaster tweets'),
):
    punc_ax.bar(punc_labels, punc_heights, color=punc_color)
    punc_ax.set_title(punc_title)

fig.suptitle('Number of Punctuations')
plt.show()

### Common Words

In [None]:
# Rank every token of the disaster corpus by frequency.
disaster_counter = Counter(disaster_corpus)
disaster_most_common_words = disaster_counter.most_common()

# From the 40 most frequent tokens, keep those that are neither stop words
# nor punctuation. The filter runs after the top-40 cut, so fewer than 40
# entries may remain.
x1, y1 = [], []
for word, count in disaster_most_common_words[:40]:
    if word in stop_words or word in punctuations:
        continue
    x1.append(word)
    y1.append(count)

# Same ranking and filtering for the not disaster corpus.
not_disaster_counter = Counter(not_disaster_corpus)
not_disaster_most_common_words = not_disaster_counter.most_common()

x2, y2 = [], []
for word, count in not_disaster_most_common_words[:40]:
    if word in stop_words or word in punctuations:
        continue
    x2.append(word)
    y2.append(count)

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 5))

# Pass x and y by keyword: seaborn 0.11 warns on positional x/y and later
# releases accept only `data` positionally, so the positional form fails there.
sns.barplot(x=x1, y=y1, ax=ax1)
ax1.set_title('disaster tweets')

sns.barplot(x=x2, y=y2, ax=ax2)
ax2.set_title('not disaster tweets')

fig.suptitle('Common Words in Tweets')
plt.show()

Lots of cleaning needed!

### Ngram Analysis

We analyze bigrams (n = 2) and trigrams (n = 3) over tweets. Let's check the most common ngrams in tweets. 