In [276]:
import pandas as pd
import seaborn as sns
import re
from pathlib import Path
import matplotlib.pyplot as plt

import numpy as np

# Apply seaborn's default theme to every matplotlib figure drawn below.
sns.set()

# Directory that the CSV files are read from in the following cells.
BASE_PATH = Path("../input/nlp-getting-started/")

## Loading the data and getting a basic idea

In [277]:
# Read the training and test splits from BASE_PATH.
train, test = (
    pd.read_csv(BASE_PATH / csv_name)
    for csv_name in ('train.csv', 'test.csv')
)

In [278]:
# Preview the first four rows of the training split.
train.head(n=4)

In [279]:
# Preview the first four rows of the test split.
test.head(n=4)

In [280]:
# Report the dimensions of both splits.
train_rows, train_cols = train.shape
test_rows, test_cols = test.shape
print(f"There are {train_rows} rows and {train_cols} columns in the training set")
print(f"There are {test_rows} rows and {test_cols} columns in the testing set")

### Checking Class Distributions 

In [281]:
# Bar chart of how many rows carry each `target` value.
class_counts = train['target'].value_counts()
count_axes = class_counts.plot.bar(title='Number of samples', color=['r', 'g'])
count_axes.set_ylabel("samples")
plt.xticks(rotation=0)  # keep the class labels horizontal
plt.show()

#### Checking Missing Values

In [282]:
# Horizontal bar chart of the number of missing values per column.
missing_per_column = train.isna().sum()
ax = missing_per_column.plot.barh(title='Missing Value Count', color='r')
ax.set_xlabel("Count")

# Write the exact count next to each bar.
for bar_group in ax.containers:
    ax.bar_label(bar_group)

#### Number of words

In [283]:
# Boolean row mask over `train`: True where `target` equals 1.
DISASTER_TWEETS = train['target'].eq(1)

In [284]:
def histplot(column, *args, **kwarg):
    """Overlay the histograms of one `train` column for the two target classes.

    Rows where DISASTER_TWEETS is False are drawn in green and rows where it
    is True in red, both on a single 8x5 inch axes; the figure is then shown.

    Parameters
    ----------
    column : str
        Name of the `train` column to plot.
    *args
        Read by position: args[0] is the legend label of the green series,
        args[1] the legend label of the red series, args[2] the axes title.
    **kwarg
        Forwarded unchanged to both `sns.histplot` calls.
    """
    figure, axes = plt.subplots(figsize=(8, 5))
    series_specs = (
        (~DISASTER_TWEETS, 'green', 0),
        (DISASTER_TWEETS, 'red', 1),
    )
    for row_mask, colour, label_pos in series_specs:
        sns.histplot(train[row_mask][column], ax=axes, color=colour,
                     label=args[label_pos], **kwarg)
    axes.legend()
    axes.set_title(args[2])
    plt.show()

In [285]:
# Number of whitespace-separated tokens in each tweet.
train['word_count'] = [len(tweet.split()) for tweet in train['text']]

histplot(
    'word_count',
    'Normal Tweet',
    'Disaster Tweet',
    'Word Count Target Distribution',
    kde=True,
    stat='density',
)

#### Number of Characters

In [286]:
# Length of each tweet in characters.
train['char_count'] = [len(tweet) for tweet in train['text']]

histplot(
    'char_count',
    'Normal Tweet',
    'Disaster Tweet',
    'Char Count Target Distribution',
    kde=True,
    stat='density',
)

#### Unique Word Count

In [287]:
# Number of distinct whitespace-separated tokens in each tweet.
train['unique_word_count'] = [len(set(tweet.split())) for tweet in train['text']]

histplot(
    'unique_word_count',
    'Normal Tweet',
    'Disaster Tweet',
    'Unique Word Count Target Distribution',
    kde=True,
    stat='density',
)

#### Average Word Length

In [288]:
# Mean token length (in characters) of each tweet.
train['average_word_length'] = [
    np.mean([len(word) for word in tweet.split()])
    for tweet in train['text']
]

histplot(
    'average_word_length',
    'Normal Tweet',
    'Disaster Tweet',
    'Average Word length Target Distribution',
    kde=True,
    stat='density',
)

#### N-gram Analysis

In [289]:
from sklearn.feature_extraction.text import CountVectorizer
from spacy.lang.en.stop_words import STOP_WORDS

# Word list (one word per line, no header row) that is later handed to the
# vectorizers as stop words. It is fetched over the network, so this cell
# needs internet access.
# keep_default_na=False stops pandas from converting entries such as "null",
# "na" or "nan" into float NaN, so every entry of column 0 stays a string.
top_10000 = pd.read_csv(
    "https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english.txt",
    header=None,
    keep_default_na=False,
)

In [290]:
def ngram_counter(ngram, text, top=5):
    """Return the most frequent n-grams of `text` as a DataFrame.

    Every word in column 0 of the module-level `top_10000` frame is treated
    as a stop word.

    Parameters
    ----------
    ngram : tuple of (min_n, max_n)
        Passed to `CountVectorizer` as `ngram_range`.
    text : iterable of str
        Documents whose n-grams are counted.
    top : int, default 5
        Maximum number of rows to return. Fewer rows come back when the
        fitted vocabulary holds fewer than `top` n-grams.

    Returns
    -------
    pandas.DataFrame
        Columns 'keywords' and 'count', ordered by descending count.
    """
    # scikit-learn >= 1.2 validates `stop_words` and accepts only a list
    # (or 'english' / None), so hand over the de-duplicated words as a list.
    stop_words = list(set(top_10000[0].values))
    vector = CountVectorizer(ngram_range=ngram, stop_words=stop_words)
    vector_text = vector.fit_transform(text)

    # Column sums are the corpus-wide count of each n-gram; flatten the
    # 1 x n_features result into a plain 1-D array.
    total_words = np.asarray(vector_text.sum(axis=0)).ravel()

    # Slice instead of indexing range(top) so a small vocabulary cannot
    # raise IndexError; max() keeps a negative `top` returning no rows.
    top_positions = total_words.argsort()[::-1][:max(top, 0)]

    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement and fall back only on releases that predate it.
    if hasattr(vector, "get_feature_names_out"):
        feature_name = vector.get_feature_names_out()
    else:
        feature_name = vector.get_feature_names()

    ngrams = pd.DataFrame(
        [(feature_name[pos], total_words[pos]) for pos in top_positions],
        columns=['keywords', 'count'],
    )
    return ngrams
    

In [291]:
# Split the tweet text by class once, then count n-grams for each class.
disaster_text = train.loc[DISASTER_TWEETS, 'text']
normal_text = train.loc[~DISASTER_TWEETS, 'text']

top_10_unigrams_disaster = ngram_counter((1, 1), disaster_text, top=10)
top_10_unigrams_normal = ngram_counter((1, 1), normal_text, top=10)

top_20_bigrams_disaster = ngram_counter((2, 2), disaster_text, top=20)
top_20_bigrams_normal = ngram_counter((2, 2), normal_text, top=20)

In [292]:
# Side-by-side horizontal bar charts of the most frequent bigrams per class.
fig, (ax, ax1) = plt.subplots(1, 2, figsize=(18, 7))
sns.barplot(data=top_20_bigrams_disaster, y='keywords', x='count', orient='h', ax=ax)
sns.barplot(data=top_20_bigrams_normal, y='keywords', x='count', orient='h', ax=ax1)
# The plotted frames are the top-20 bigram tables, so the titles say 20.
ax.set_title("Top 20 Bigrams in Disaster Tweet")
ax1.set_title("Top 20 Bigrams in Normal Tweet")

In [293]:
# Side-by-side horizontal bar charts of the most frequent unigrams per class.
fig, (ax, ax1) = plt.subplots(nrows=1, ncols=2, figsize=(18, 7))
for panel, counts in ((ax, top_10_unigrams_disaster), (ax1, top_10_unigrams_normal)):
    sns.barplot(x='count', y='keywords', data=counts, orient='h', ax=panel)
ax.set_title("Top 10 Unigrams in Disaster Tweet")
ax1.set_title("Top 10 Unigrams in Normal Tweet")

#### Analysing Keyword & Location columns

In [294]:
# Show a few random keywords and how many distinct ones the training split has.
train_keywords = train['keyword']
print(f"Sample Keywords are: {train_keywords.sample(4).values}")
print(f"Number of Unique Keywords: {train_keywords.nunique()}")

# Distinct keyword counts of the two splits, test first and train second.
test['keyword'].nunique(), train_keywords.nunique()

In [295]:
# Show a few random locations and how many distinct ones the training split has.
train_locations = train['location']
print(f"Sample Locations are: {train_locations.sample(4).values}")
print(f"Number of Unique Locations: {train_locations.nunique()}")

In [296]:
# Mean `target` of each keyword group, broadcast back onto every row.
train['target_mean'] = train.groupby('keyword')['target'].transform('mean')

# Sort once so the keywords run from the highest to the lowest mean target.
ranked_rows = train.sort_values(by='target_mean', ascending=False)

fig = plt.figure(figsize=(8, 80), dpi=100)

# One pair of bars per keyword, split by target value.
sns.countplot(y=ranked_rows['keyword'], hue=ranked_rows['target'])

plt.tick_params(axis='x', labelsize=15)
plt.tick_params(axis='y', labelsize=12)
plt.legend(loc=1)
plt.title('Target Distribution in Keywords')

plt.show()

## Cleaning and processing data

In [311]:
!pip install textacy

import html
import re
from textacy import preprocessing as tprep

pd.set_option('display.max_colwidth', 50)


In [298]:
def clean(text):
    """Normalise one tweet and return it lower-cased.

    Steps, in order:
      1. drop three mis-decoded character sequences;
      2. blank out URLs, user handles, emojis, numbers and currency symbols;
      3. remove HTML tags and bracketed spans, normalise hyphenated words,
         quotation marks and unicode;
      4. strip accents and punctuation;
      5. collapse whitespace.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # Mis-decoded sequences: \x89Û followed by _, Ò or Ó.
    text = re.sub(r"\x89Û[_ÒÓ]", "", text)

    text = tprep.replace.urls(text, '')
    text = tprep.replace.user_handles(text, '')
    text = tprep.replace.emojis(text, '')
    # A single pass, placed after handles/emojis are gone.
    text = tprep.replace.numbers(text, '')
    text = tprep.replace.currency_symbols(text, '')

    # These steps match on punctuation characters (angle brackets, brackets,
    # hyphens, quotes), so they have to run BEFORE remove.punctuation;
    # afterwards there would be nothing left for them to match.
    text = tprep.remove.html_tags(text)
    text = tprep.remove.brackets(text)
    text = tprep.normalize.hyphenated_words(text)
    text = tprep.normalize.quotation_marks(text)
    text = tprep.normalize.unicode(text)

    text = tprep.remove.accents(text)
    text = tprep.remove.punctuation(text)

    # Removals leave runs of spaces behind; collapse them last.
    text = tprep.normalize.whitespace(text)

    return text.lower()


In [299]:
# Replace missing keywords with an empty string. The result is assigned back
# instead of calling fillna(inplace=True) on the column selection: that
# chained in-place form emits a warning on pandas 2.x and does not update the
# frame once Copy-on-Write is active.
train['keyword'] = train['keyword'].fillna("")
test['keyword'] = test['keyword'].fillna("")

# Prepend the keyword to the tweet text.
# NOTE: `text` is overwritten, so running this cell twice prepends the
# keyword twice; re-run from the loading cell instead.
train['text'] = train['keyword'] + ' ' + train['text']
test['text'] = test['keyword'] + ' ' + test['text']

In [300]:
# Run the text of both splits through clean(), overwriting the column.
for frame in (train, test):
    frame['text'] = frame['text'].map(clean)

In [301]:
# List the current column names of the training frame.
train.columns

In [302]:
# Discard `location` together with the helper columns created for the plots above.
train.drop(
    columns=[
        'location',
        'word_count',
        'char_count',
        'unique_word_count',
        'average_word_length',
        'target_mean',
    ],
    inplace=True,
)

In [303]:
# Look at five random rows after cleaning.
train.sample(n=5)

### Model

In [304]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression



In [305]:
# Hold out 20% of the rows for evaluation, stratified on `target`.
X_train, X_test, Y_train, Y_test = train_test_split(
    train['text'],
    train['target'],
    test_size=0.2,
    stratify=train['target'],
    random_state=42,
)

In [306]:
# TF-IDF over unigrams and bigrams that appear in at least 4 training documents.
# stop_words is passed as a list because scikit-learn >= 1.2 validates the
# parameter and rejects a set.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words=list(set(top_10000[0].values)),
    min_df=4,
)

# Fit on the training part only, then reuse the fitted vocabulary for the hold-out part.
X_train_tf = vectorizer.fit_transform(X_train)

X_test_tf = vectorizer.transform(X_test)

In [307]:
# Linear-kernel SVM trained on the TF-IDF features and scored on the hold-out split.
svc = SVC(C=0.9, kernel="linear", random_state=42, tol=1e-5)
svc.fit(X_train_tf, Y_train)

Y_pred_svc = svc.predict(X_test_tf)
svc_report = classification_report(Y_test, Y_pred_svc)
print(svc_report)

In [308]:
# Multinomial Naive Bayes on the same TF-IDF features, scored on the hold-out split.
naive = MultinomialNB(alpha=1.0)
naive.fit(X_train_tf, Y_train)

Y_pred_naive = naive.predict(X_test_tf)
naive_report = classification_report(Y_test, Y_pred_naive)
print(naive_report)

In [309]:
# Load the submission template and peek at four random rows.
sample_sub = pd.read_csv(BASE_PATH / 'sample_submission.csv')

sample_sub.sample(n=4)

In [310]:
# Vectorise the cleaned test text with the fitted TF-IDF vocabulary and predict with the SVM.
test_tf = vectorizer.transform(test['text'])
test_pred = svc.predict(test_tf)

# Pair the predictions with the ids of the submission template.
# NOTE(review): assumes the template rows are in the same order as `test` — confirm.
submission_ids = sample_sub['id'].values.tolist()
sub = pd.DataFrame({'id': submission_ids, 'target': test_pred})
sub.to_csv('submission.csv', index=False)