In [315]:
import re
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Apply seaborn's default theme to every figure drawn below.
sns.set()

# Directory (relative path) that train.csv and test.csv are read from.
BASE_PATH = Path("../input/nlp-getting-started/")

## Loading the data and getting a basic idea

In [316]:
# Read the training and test CSV files from BASE_PATH into DataFrames.
train = pd.read_csv(BASE_PATH/'train.csv')
test = pd.read_csv(BASE_PATH/'test.csv')

In [317]:
# Preview the first four rows of the training frame.
train.head(4)

In [318]:
# Preview the first four rows of the test frame.
test.head(4)

In [319]:
# Report the (rows, columns) shape of both frames.
print(f"There are {train.shape[0]} rows and {train.shape[1]} columns in the training set")
print(f"There are {test.shape[0]} rows and {test.shape[1]} columns in the testing set")

### Checking Class Distributions 

In [352]:
# Bar chart of how many training rows carry each `target` value.
class_counts = train['target'].value_counts()
ax = class_counts.plot.bar(title='Number of samples', color=['r', 'g'])
ax.set_ylabel("samples")
plt.xticks(rotation=0)  # keep the class labels horizontal
plt.show()

#### Checking Missing Values

In [610]:
# Horizontal bar chart of the number of missing values in each column.
missing_counts = train.isna().sum()
ax = missing_counts.plot.barh(title='Missing Value Count', color='r')
ax.set_xlabel("Count")

# Write the count next to every bar.
for bars in ax.containers:
    ax.bar_label(bars)

#### Number of words

In [321]:
# Boolean mask over `train`: True for rows whose target is 1.
DISASTER_TWEETS = train['target'] == 1

In [322]:
def histplot(column, *args, **kwarg):
    """Overlay histograms of ``train[column]`` split by ``DISASTER_TWEETS``.

    Reads the module-level ``train`` frame and ``DISASTER_TWEETS`` mask,
    draws both histograms on one axes and shows the figure.

    Parameters
    ----------
    column
        Name of the ``train`` column to plot.
    *args
        Three positional values, in order: the legend label for rows where
        ``DISASTER_TWEETS`` is False (drawn in green), the legend label for
        rows where it is True (drawn in red), and the axes title.
    **kwarg
        Forwarded unchanged to both ``sns.histplot`` calls.
    """
    fig, ax = plt.subplots(figsize=(8, 5))

    # (row mask, bar colour, position of the legend label inside ``args``)
    layers = (
        (~DISASTER_TWEETS, 'green', 0),
        (DISASTER_TWEETS, 'red', 1),
    )
    for mask, colour, label_idx in layers:
        sns.histplot(train[mask][column], ax=ax, color=colour,
                     label=args[label_idx], **kwarg)

    ax.legend()
    ax.set_title(args[2])
    plt.show()

In [323]:
# Number of whitespace-separated tokens in each tweet.
train['word_count'] = [len(tweet.split()) for tweet in train['text']]

histplot(
    'word_count',
    'Normal Tweet',
    'Disaster Tweet',
    'Word Count Target Distribution',
    kde=True,
    stat='density',
)

#### Number of Characters

In [325]:
# Length of each tweet in characters.
train['char_count'] = [len(tweet) for tweet in train['text']]

histplot(
    'char_count',
    'Normal Tweet',
    'Disaster Tweet',
    'Char Count Target Distribution',
    kde=True,
    stat='density',
)

#### Unique Word Count

In [328]:
# Number of distinct whitespace-separated tokens in each tweet.
train['unique_word_count'] = [len(set(tweet.split())) for tweet in train['text']]

histplot(
    'unique_word_count',
    'Normal Tweet',
    'Disaster Tweet',
    'Unique Word Count Target Distribution',
    kde=True,
    stat='density',
)

#### Average Word Length

In [332]:
# Mean token length (in characters) of each tweet.
train['average_word_length'] = [
    np.mean([len(word) for word in tweet.split()])
    for tweet in train['text']
]

histplot(
    'average_word_length',
    'Normal Tweet',
    'Disaster Tweet',
    'Average Word length Target Distribution',
    kde=True,
    stat='density',
)

#### N-gram Analysis

In [599]:
from sklearn.feature_extraction.text import CountVectorizer
from spacy.lang.en.stop_words import STOP_WORDS

In [600]:
def ngram_counter(ngram, text, top=5):
    """Return the most frequent n-grams found in ``text``.

    Parameters
    ----------
    ngram : tuple of (int, int)
        Passed to ``CountVectorizer`` as ``ngram_range``.
    text : iterable of str
        Documents whose n-grams are counted.
    top : int, default 5
        Maximum number of n-grams to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``keywords`` and ``count``, ordered by ``count`` descending.
        Contains fewer than ``top`` rows when the fitted vocabulary is
        smaller than ``top``.
    """
    # scikit-learn accepts 'english', a list or None for ``stop_words``;
    # the spaCy stop words are a set, so convert them to a list.
    vector = CountVectorizer(ngram_range=ngram, stop_words=list(STOP_WORDS))
    vector_text = vector.fit_transform(text)

    # Vocabulary terms, indexed the same way as the count-matrix columns.
    feature_names = vector.get_feature_names_out()

    # Column sums of the document-term matrix, flattened to a 1-D array.
    total_words = np.asarray(vector_text.sum(axis=0)).ravel()

    # Slice rather than index so a small vocabulary cannot raise IndexError.
    top_idx = total_words.argsort()[::-1][:max(top, 0)]

    return pd.DataFrame({
        'keywords': feature_names[top_idx],
        'count': total_words[top_idx],
    })
    

In [604]:
# Ten most frequent unigrams for each class.
top_10_unigrams_disaster = ngram_counter((1,1), train[DISASTER_TWEETS]['text'], 10)
top_10_unigrams_normal = ngram_counter((1,1), train[~DISASTER_TWEETS]['text'], 10)

# Ten most frequent bigrams for each class; the count matches the
# variable names and the "Top 10" titles used when these are plotted.
top_10_bigrams_disaster = ngram_counter((2,2), train[DISASTER_TWEETS]['text'], 10)
top_10_bigrams_normal = ngram_counter((2,2), train[~DISASTER_TWEETS]['text'], 10)

In [605]:
# Side-by-side horizontal bar charts: disaster bigrams on the left axes,
# normal bigrams on the right axes.
fig, (ax,ax1) = plt.subplots(1, 2, figsize = (18, 7))
sns.barplot(data=top_10_bigrams_disaster, y='keywords', x='count', orient='h', ax=ax)
sns.barplot(data=top_10_bigrams_normal, y='keywords', x='count', orient='h', ax=ax1)
# Each title goes on the axes that holds the matching data.
ax.set_title("Top 10 Bigrams in Disaster Tweet")
ax1.set_title("Top 10 Bigrams in Normal Tweet")
plt.show()

#### Analysing Keyword & Location columns

In [607]:
# Show four randomly sampled keywords (no seed is set, so they differ
# between runs) and the number of distinct keywords.
print(f"Sample Keywords are: {train['keyword'].sample(4).values}")
print(f"Number of Unique Keywords: {train['keyword'].nunique()}")

In [608]:
# Show four randomly sampled locations (no seed is set, so they differ
# between runs) and the number of distinct locations.
print(f"Sample Locations are: {train['location'].sample(4).values}")
print(f"Number of Unique Locations: {train['location'].nunique()}")

In [609]:
# Mean `target` of each keyword group, broadcast back onto every row.
train['target_mean'] = train.groupby('keyword')['target'].transform('mean')

# Rows ordered by descending target_mean; both plotted columns come from
# this one frame so they stay aligned.
by_target_mean = train.sort_values(by='target_mean', ascending=False)

fig = plt.figure(figsize=(8, 80), dpi=100)

sns.countplot(y=by_target_mean['keyword'], hue=by_target_mean['target'])

plt.tick_params(axis='x', labelsize=15)
plt.tick_params(axis='y', labelsize=12)
plt.legend(loc=1)
plt.title('Target Distribution in Keywords')

plt.show()