In [1]:
import pandas as pd
import numpy as np
import time

import seaborn as sns
import matplotlib.pylab as plt
%matplotlib inline

In [None]:
# Training split; the first CSV column is used as the DataFrame index.
dataPath = 'nlp-getting-started/train.csv'
data = pd.read_csv(filepath_or_buffer=dataPath, index_col=0)

In [None]:
# Print a concise summary of `data`: columns, non-null counts and dtypes.
data.info()

### Count

In [None]:
# Number of non-null entries in each column.
data.count()

### NaN

In [None]:
# Number of missing (NaN) entries in each column.
data.isna().sum()

In [None]:
# Rows with no missing value in any column; `data` itself is left unchanged.
dataNonNull = data.dropna()

In [None]:
# Per-column comparison between the raw frame and the frame restricted to
# complete rows.
#   "All"      -> data.count(): non-null entries of the column in `data`
#   "NonNull"  -> dataNonNull.count(): entries left after dropping every row
#                 that has at least one missing value
# The counts are computed once and looked up by column label; integer keys on
# a label-indexed Series are deprecated positional access.
allCounts = data.count()
nonNullCounts = dataNonNull.count()

# Headers are right-aligned so they sit above the right-aligned numbers.
print(f'{"Columns":20}: {"All":>10} {"NonNull":>10} {"%NonNull":>10} {"Difference":>10}')
for col in data.columns:
    allValue = allCounts[col]
    nonNullValue = nonNullCounts[col]
    per = nonNullValue*100/allValue
    diff = allValue - nonNullValue
    print(f'{col:20}: {allValue:10} {nonNullValue:10} {np.round(per):10} {diff:10}')

In [None]:
# Replace missing keyword/location values with an empty string.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: that chained in-place form warns in pandas 2.x and does not
# modify `data` when Copy-on-Write is enabled.
data['keyword'] = data['keyword'].fillna("")
data['location'] = data['location'].fillna("")

## Target

In [None]:
columnName = 'target'

#----------------------
def getCategoricalColumn(value):
    """Return "Disaster" when ``value`` equals 1, otherwise "Not disaster"."""
    return "Disaster" if value == 1 else "Not disaster"
    
# Human-readable label for every target value, stored as the 'catTarget' series.
CategoricalColumn = data[columnName].map(getCategoricalColumn)
CategoricalColumn.name = 'catTarget'

# Copy of `data` with the label series appended as an extra column.
df = pd.concat(objs=[data, CategoricalColumn], axis='columns')
#----------------------

# One record per target label holding the number of rows that carry it.
groups = [
    {CategoricalColumn.name: label, 'Count': len(rows)}
    for label, rows in df.groupby(by=CategoricalColumn.name)
]

lenData = data[columnName].count()

dataCategoricalQuality = pd.DataFrame(groups)

fig, ax = plt.subplots(figsize=(4, 4))
dataCategoricalQuality.plot.bar(x=CategoricalColumn.name, ax=ax)

# Annotate each bar with "<count>: <share>%" just above its top edge.
for position, entry in enumerate(groups):
    count = entry['Count']
    share = np.round(count*100/lenData)
    ax.text(position, count, f'{count!s}: {share!s}%',
            horizontalalignment='center', verticalalignment='bottom')

# Upper y-limit is 4/5 of the total row count.
ax.set_ylim(0, lenData - lenData/5)

ax.set(xlabel='target', ylabel='Count', title='Sum: ' + str(lenData))
plt.xticks(rotation=0)

plt.tight_layout()
plt.show()

### target vs keyword

In [None]:
# Turn URL-encoded spaces in keywords back into real spaces.
# regex=False makes the literal match explicit; the default of `regex`
# differs between pandas versions.
data['keyword'] = data['keyword'].str.replace('%20', ' ', regex=False)

In [None]:
columnNameA = 'target'
columnNameB = 'keyword'

# Keyword x target contingency table (with 'All' margins), using readable
# names for the two target columns.
targetLabels = {0: 'Not disaster', 1: 'Disaster'}
crossTable = pd.crosstab(
    index=data[columnNameB],
    columns=data[columnNameA],
    margins=True,
).rename(columns=targetLabels)

In [None]:
print('Most frequent Keywords for Disaster')
# The crosstab is built with margins=True, so its 'All' totals row would
# always rank first; drop it so that only actual keywords are listed.
crossTable.drop(index='All', errors='ignore').sort_values(by='Disaster', ascending=False).head(10)

In [None]:
print('Most frequent Keywords for Not disaster')
# The crosstab is built with margins=True, so its 'All' totals row would
# always rank first; drop it so that only actual keywords are listed.
crossTable.drop(index='All', errors='ignore').sort_values(by='Not disaster', ascending=False).head(10)

#### target vs keyword length (character)

In [None]:
# Character length of each keyword. The vectorised .str accessor yields NaN
# for a missing keyword instead of raising like len() does on a float NaN.
data['keywordLengthChar'] = data['keyword'].str.len()

In [None]:
columnNameA = 'target'
columnNameB = 'keywordLengthChar'

# Distribution of keyword length for each target value.
fig, ax = plt.subplots()
sns.boxplot(x=columnNameA, y=columnNameB, data=data, ax=ax)

fig.tight_layout()
plt.show()

In [None]:
columnNameA = 'target'
columnNameB = 'keywordLengthChar'

# One histogram panel of keyword length per target value.
g = sns.FacetGrid(data=data, col=columnNameA).map(sns.histplot, columnNameB, bins=30)

plt.tight_layout()
plt.show()

We observe that keyword lengths are longer in Not Disaster than in Disaster

### target vs location

In [None]:
columnNameA = 'target'
columnNameB = 'location'

# Location x target contingency table (with 'All' margins), using readable
# names for the two target columns.
targetLabels = {0: 'Not disaster', 1: 'Disaster'}
crossTable = pd.crosstab(
    index=data[columnNameB],
    columns=data[columnNameA],
    margins=True,
).rename(columns=targetLabels)

In [None]:
print('Most frequent Locations for Disaster')
# The crosstab is built with margins=True, so its 'All' totals row would
# always rank first; drop it so that only actual locations are listed.
crossTable.drop(index='All', errors='ignore').sort_values(by='Disaster', ascending=False).head(10)

In [None]:
print('Most frequent Locations for Not disaster')
# The crosstab is built with margins=True, so its 'All' totals row would
# always rank first; drop it so that only actual locations are listed.
crossTable.drop(index='All', errors='ignore').sort_values(by='Not disaster', ascending=False).head(10)

### target vs text

#### Number of characters (including spaces)

In [None]:
# Number of characters (spaces included) in each tweet text.
data['textLengthChar'] = data['text'].str.len()

In [None]:
columnName = 'textLengthChar'

# Box plot of the character length over all tweets.
ax = data[columnName].plot(kind='box', figsize=(3, 4))
ax.set(ylabel=columnName)

plt.tight_layout()
plt.show()

In [None]:
columnNameA = 'target'
columnNameB = 'textLengthChar'

# One histogram panel of text character length per target value.
g = sns.FacetGrid(data=data, col=columnNameA).map(sns.histplot, columnNameB, bins=30)

plt.tight_layout()
plt.show()

We observe that text lengths are longer in Not Disaster than in Disaster

In [None]:
columnNameA = 'target'
columnNameB = 'textLengthChar'

# Distribution of text character length for each target value.
fig, ax = plt.subplots()
sns.boxplot(x=columnNameA, y=columnNameB, data=data, ax=ax)

fig.tight_layout()
plt.show()

#### Number of words

In [None]:
def getWordTextLength(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    words = text.split()
    return len(words)
# Word count of each tweet text.
data['textLengthWord'] = data['text'].map(getWordTextLength)

In [None]:
columnName = 'textLengthWord'

# Box plot of the word count over all tweets.
ax = data[columnName].plot(kind='box', figsize=(3, 4))
ax.set(ylabel=columnName)

plt.tight_layout()
plt.show()

In [None]:
columnNameA = 'target'
columnNameB = 'textLengthWord'

# One histogram panel of text word count per target value.
g = sns.FacetGrid(data=data, col=columnNameA).map(sns.histplot, columnNameB, bins=30)

plt.tight_layout()
plt.show()

We observe that the number of words in the text is higher in Not Disaster than in Disaster

In [None]:
columnNameA = 'target'
columnNameB = 'textLengthWord'

# Distribution of text word count for each target value.
fig, ax = plt.subplots()
sns.boxplot(x=columnNameA, y=columnNameB, data=data, ax=ax)

fig.tight_layout()
plt.show()

#### Link in text

In [None]:
# First http(s) URL found in each text (NaN when there is none).
# expand=False makes str.extract return a Series for the single capture
# group instead of a one-column DataFrame that is then coerced on assignment.
pattern = r'(https?://\S+)'
data['link'] = data['text'].str.extract(pattern, expand=False)

# True for every row whose text contains at least one link.
data['containLink'] = data['link'].notna()

In [None]:
columnNameA = 'target'
columnNameB = 'containLink'

# Link-presence x target contingency table (with 'All' margins) using
# readable row and column labels.
crossTable = pd.crosstab(
    index=data[columnNameB],
    columns=data[columnNameA],
    margins=True,
).rename(
    columns={0: 'Not disaster', 1: 'Disaster'},
    index={False: 'No link', True: 'Link'},
)

# Share of each target class within every row, in percent of the row total.
for label in ('Not disaster', 'Disaster'):
    crossTable[label + ' %'] = crossTable[label] * 100 / crossTable['All']

crossTable

We observe that there are more links in Disaster than Not Disaster.

# Predict and write file

#### For Pyspark

In [None]:
# NOTE(review): `algorithm`, `trainDataPreprocessed` and `testDataPreprocessed`
# are not defined in this notebook; they must already exist in the kernel.
fittedModel = algorithm.fit(trainDataPreprocessed)
predictionTest = fittedModel.transform(testDataPreprocessed)

# Collect the 'prediction' column and flatten it into a NumPy array.
predictionRows = predictionTest.select('prediction').collect()
predLabelTest = np.array(predictionRows).squeeze()

#### For other algorithms

##### LightGBM

In [None]:
# NOTE(review): `modelLgb` and `testPath` are not defined in this notebook;
# they must already exist in the kernel.
predictionTest = modelLgb.predict(testPath, num_iteration=modelLgb.best_iteration)
# Threshold the predicted scores at 0.5 with one vectorised comparison
# instead of a Python-level loop over the array.
predLabelTest = (np.asarray(predictionTest) >= 0.5).astype(int)

#### Upload to kaggle

In [None]:
# Fill the sample submission with the predicted labels and write it out.
# NOTE(review): assumes predLabelTest is ordered like the rows of
# sample_submission.csv - confirm against the test-set preprocessing.
submission = pd.read_csv('nlp-getting-started/sample_submission.csv')
predLabels = np.asarray(predLabelTest).astype(int)
if len(predLabels) != len(submission):
    raise ValueError(
        f'Got {len(predLabels)} predictions for {len(submission)} submission rows'
    )
# Assign rather than add: adding is only correct if the sample file's
# 'target' column is all zeros.
submission['target'] = predLabels
submission.to_csv('submission.csv', index=False)

In [None]:
# Submit submission.csv to the nlp-getting-started competition through the
# Kaggle CLI; the submission message (-m) is left empty.
!kaggle competitions submit -c nlp-getting-started -f submission.csv -m ""