In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [None]:
train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
sample_submission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
sample_submission

In [None]:
train.head()

In [None]:
test.head()

In [None]:
train['text'].sample(30)

**Text processing require:**  
- remove punctuation  
- remove numbers  
- to lowercase  
- remove stopwords  
- lemmatize  
- remove non-english words  

**NOTE** : The order of the steps matter. The text processing function should execute steps in this outlined order.

In [88]:
# A function to process text as outlined above

import string
import nltk
from nltk.corpus import stopwords

words = set(nltk.corpus.words.words())

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def text_process(text):
    no_punc = [c for c in text if c not in string.punctuation]
    no_punc = ''.join(no_punc)
    no_dig = [c for c in no_punc if not c.isdigit()]
    no_dig = ''.join(no_dig)
    lcase = no_dig.lower()
    no_stop = [word for word in lcase.split() if word not in stopwords.words('english')]
    no_stop = ' '.join(no_stop)
    lemmatized = [lemmatizer.lemmatize(word) for word in nltk.word_tokenize(no_stop)]
    lemmatized = ' '.join(lemmatized)
    eng = [word for word in nltk.wordpunct_tokenize(lemmatized) if word in words or not word.isalpha()]
    return eng

In [89]:
from sklearn.feature_extraction.text import CountVectorizer

bow_transformer = CountVectorizer(analyzer=text_process)
bow_transformer.fit(train['text'])

CountVectorizer(analyzer=<function text_process at 0x7f817acd5730>,
                binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [92]:
len(bow_transformer.vocabulary_)

6702

In [98]:
print(bow_transformer.transform([train['text'][1230]]))

  (0, 325)	1
  (0, 1048)	1
  (0, 2099)	1
  (0, 2248)	1
  (0, 2698)	1
  (0, 3017)	1


In [99]:
bow_transformer.get_feature_names()[1048]

'cladding'

In [93]:
bow_text = bow_transformer.transform(train['text'])

In [104]:
bow_text.shape

(7613, 6702)

In [94]:
#tfidf

from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(bow_text)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [100]:
print(tfidf_transformer.transform(bow_transformer.transform([train['text'][1230]])))

  (0, 3017)	0.4700906051478796
  (0, 2698)	0.34633250839624347
  (0, 2248)	0.22818049172662036
  (0, 2099)	0.43932738517349695
  (0, 1048)	0.4700906051478796
  (0, 325)	0.43932738517349695


In [102]:
tfidf_transformer.idf_[bow_transformer.vocabulary_['cladding']]

8.551449575822552

In [95]:
tfidf_text = tfidf_transformer.transform(bow_text)

In [96]:
tfidf_text.shape

(7613, 6702)

**ML Algorithm Selection**

In [106]:
train['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [107]:
# So, a classification algorithm eg. Logistic regression, naive bayes

In [109]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()

In [112]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tfidf_text, train['target'], test_size=0.3, random_state=101)

In [118]:
from sklearn.metrics import confusion_matrix, classification_report

In [114]:
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

nb.fit(X_train, y_train)
nb_pred = nb.predict(X_test)

rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)

In [119]:
print("---------------------------------")
print("Logistic Regression")
print("---------------------------------")
print(confusion_matrix(y_test, lr_pred))
print(classification_report(y_test, lr_pred))
print("---------------------------------")
print("Naive Bayes MultinomialNB")
print("---------------------------------")
print(confusion_matrix(y_test, nb_pred))
print(classification_report(y_test, nb_pred))
print("---------------------------------")
print("RandomForestClassifier")
print("---------------------------------")
print(confusion_matrix(y_test, rfc_pred))
print(classification_report(y_test, rfc_pred))

---------------------------------
Logistic Regression
---------------------------------
[[1184  146]
 [ 333  621]]
              precision    recall  f1-score   support

           0       0.78      0.89      0.83      1330
           1       0.81      0.65      0.72       954

    accuracy                           0.79      2284
   macro avg       0.80      0.77      0.78      2284
weighted avg       0.79      0.79      0.79      2284

---------------------------------
Naive Bayes MultinomialNB
---------------------------------
[[1154  176]
 [ 328  626]]
              precision    recall  f1-score   support

           0       0.78      0.87      0.82      1330
           1       0.78      0.66      0.71       954

    accuracy                           0.78      2284
   macro avg       0.78      0.76      0.77      2284
weighted avg       0.78      0.78      0.78      2284

---------------------------------
RandomForestClassifier
---------------------------------
[[1139  191]
 [ 328

In [120]:
# Logistic regression did quite well than the other two

**Building a data pipeline to predict on the test data.**

In [129]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ("bow cv", CountVectorizer(analyzer=text_process)),
    ("tfidf", TfidfTransformer()),
    ("lr classifier", LogisticRegression())
])

In [132]:
text_train, text_test, label_train, label_test = train_test_split(train['text'], train['target'],
                                                                  test_size=0.3, random_state=101) 

In [133]:
pipe.fit(text_train, label_train)

Pipeline(memory=None,
         steps=[('bow cv',
                 CountVectorizer(analyzer=<function text_process at 0x7f817acd5730>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\...
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('lr classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                        

In [135]:
pipe_pred = pipe.predict(test['text'])

In [137]:
pipe_pred.shape

(3263,)

In [138]:
test.shape

(3263, 4)

In [141]:
submission_df = pd.DataFrame({
    "id": test['id'],
    "target": pipe_pred
})

In [139]:
sample_submission

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0
...,...,...
3258,10861,0
3259,10865,0
3260,10868,0
3261,10874,0


In [142]:
submission_df

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [144]:
submission_df.to_csv('submission_df.csv', index=False)