In [1]:
import os

In [2]:
# Directory the Kaggle CLI should search for its kaggle.json credentials file.
KAGGLE_CONFIG_DIR = '.'
os.environ['KAGGLE_CONFIG_DIR'] = KAGGLE_CONFIG_DIR

In [3]:
# Fetch only train.csv (-f) for the competition (-c) into the dataset/ directory (-p).
!kaggle competitions download -c quora-insincere-questions-classification -f train.csv -p dataset

train.csv.zip: Skipping, found more recently modified local copy (use --force to force download)


In [4]:
# Fetch the test set and the sample submission into the same dataset/ directory.
!kaggle competitions download -c quora-insincere-questions-classification -f test.csv -p dataset
!kaggle competitions download -c quora-insincere-questions-classification -f sample_submission.csv -p dataset

test.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
sample_submission.csv.zip: Skipping, found more recently modified local copy (use --force to force download)


In [5]:
# !chmod 600 ./kaggle.json

### Data Exploration

In [6]:
import pandas as pd

In [7]:
# Paths of the zipped competition files inside the dataset/ directory.
train_fname, test_fname, submission_fname = (
    'dataset/train.csv.zip',
    'dataset/test.csv.zip',
    'dataset/sample_submission.csv.zip',
)

In [8]:
# Read the training, test and sample-submission files, in that order.
df, test_df, submission_df = (
    pd.read_csv(path)
    for path in (train_fname, test_fname, submission_fname)
)

In [9]:
# Preview the training frame; the rendered table shows qid, question_text and target.
df

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0
...,...,...,...
1306117,ffffcc4e2331aaf1e41e,What other technical skills do you need as a c...,0
1306118,ffffd431801e5a2f4861,Does MS in ECE have good job prospects in USA ...,0
1306119,ffffd48fb36b63db010c,Is foam insulation toxic?,0
1306120,ffffec519fa37cf60c78,How can one start a research project based on ...,0


## Text Preprocessing

### Tokenization

In [10]:
import nltk
from nltk.tokenize import word_tokenize

In [11]:
# Download NLTK's 'punkt' package (presumably the tokenizer data word_tokenize needs).
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Removing Stop Words

In [12]:
from nltk.corpus import stopwords

In [13]:
# Download NLTK's 'stopwords' package (presumably the data behind nltk.corpus.stopwords).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
# NLTK's English stop words; tokenize and the CountVectorizer below both use them.
english_stopwords = stopwords.words('english')

In [15]:
# Show the stop words as a single comma-separated string.
", ".join(english_stopwords)

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

### Stemming

In [16]:
from nltk.stem.snowball import SnowballStemmer

In [17]:
# Snowball stemmer for English; tokenize calls stemmer.stem on every kept token.
stemmer = SnowballStemmer(language='english')

In [18]:
# Frozen-set snapshot of the stop-word list, so each membership test in
# tokenize is a hash lookup rather than a linear scan of the list. Re-run this
# cell if english_stopwords is changed afterwards.
_STOPWORD_SET = frozenset(english_stopwords)


def tokenize(text):
    """Split ``text`` into stemmed tokens with English stop words removed.

    Args:
        text: The string to tokenize.

    Returns:
        A list with the Snowball stem of every token produced by
        ``word_tokenize`` whose lowercased form is not an English stop word,
        in order of appearance. No other filtering is applied, so punctuation
        tokens are kept.
    """
    return [
        stemmer.stem(word)
        for word in word_tokenize(text)
        if word.lower() not in _STOPWORD_SET
    ]

### Count Vectorizer

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
# Bag-of-words vectorizer driven by the custom tokenize function and capped at
# 1000 features.
# NOTE(review): tokenize already drops stop words, and stop_words is applied
# again to the tokens it returns. Presumably redundant, but stems that happen
# to coincide with a stop word are only caught here; confirm before removing.
vectorizer = CountVectorizer(
    tokenizer=tokenize,
    stop_words=english_stopwords,
    lowercase=True,
    max_features=1000,
)

In [21]:
%%time
# Learn the vocabulary from the training questions; %%time reports the cost.
vectorizer.fit(df['question_text'])



CPU times: user 6min 20s, sys: 1.43 s, total: 6min 22s
Wall time: 6min 28s


In [22]:
# Inspect the first 100 learned feature names.
vectorizer.get_feature_names_out()[:100]

array(['!', '$', '%', '&', "'", "''", "'m", "'s", '(', ')', ',', '-', '.',
       '1', '10', '100', '12', '12th', '15', '2', '20', '2017', '2018',
       '3', '4', '5', '6', '7', '8', ':', '?', '[', ']', '``', 'abl',
       'abroad', 'abus', 'accept', 'access', 'accomplish', 'accord',
       'account', 'achiev', 'acid', 'act', 'action', 'activ', 'actor',
       'actual', 'ad', 'add', 'address', 'admiss', 'adult', 'advanc',
       'advantag', 'advic', 'affect', 'africa', 'african', 'age', 'ago',
       'air', 'allow', 'almost', 'alon', 'alreadi', 'also', 'altern',
       'alway', 'amazon', 'america', 'american', 'among', 'amount',
       'analysi', 'android', 'anim', 'anoth', 'answer', 'anyon', 'anyth',
       'apart', 'app', 'appear', 'appl', 'appli', 'applic', 'approach',
       'arab', 'area', 'arm', 'armi', 'around', 'art', 'asian', 'ask',
       'associ', 'atheist', 'attack'], dtype=object)

In [23]:
%%time
# Encode the training questions as a count matrix using the fitted vocabulary.
# NOTE(review): this tokenizes the same corpus that fit() already processed; a
# single fit_transform() would presumably avoid paying that cost twice. Confirm.
inputs = vectorizer.transform(df['question_text'])

CPU times: user 6min 21s, sys: 1.17 s, total: 6min 22s
Wall time: 6min 26s


In [24]:
%%time
# Encode the test questions with the vocabulary learned from the training data.
test_inputs = vectorizer.transform(test_df['question_text'])

CPU times: user 1min 47s, sys: 256 ms, total: 1min 48s
Wall time: 1min 48s


In [25]:
# Report the shapes of the vectorized train and test matrices, one per line.
print(
    f"Train Input Shape: {inputs.shape}",
    f"Test Input Shape: {test_inputs.shape}",
    sep="\n",
)

Train Input Shape: (1306122, 1000)
Test Input Shape: (375806, 1000)


## Machine Learning for Text Classification

### Create training and validation sets

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
# Hold out 30% of the vectorized questions for validation; the fixed seed keeps
# the split repeatable.
VAL_FRACTION = 0.3
SPLIT_SEED = 42
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    inputs,
    df['target'],
    test_size=VAL_FRACTION,
    random_state=SPLIT_SEED,
)

In [28]:
# Report the shapes of both splits, inputs first and then targets.
print(
    f"Train Input Shape: {train_inputs.shape}",
    f"Validation Input Shape: {val_inputs.shape}",
    f"Train Target Shape: {train_targets.shape}",
    f"Validation Target Shape: {val_targets.shape}",
    sep="\n",
)

Train Input Shape: (914285, 1000)
Validation Input Shape: (391837, 1000)
Train Target Shape: (914285,)
Validation Target Shape: (391837,)


### Logistic Regression Model

In [29]:
from sklearn.linear_model import LogisticRegression

In [30]:
# Logistic-regression classifier trained with the stochastic 'sag' solver.
# max_iter=1000 sets the cap on solver iterations. random_state pins the data
# shuffling that 'sag' performs, so repeated runs give the same model; 42
# matches the seed used for the train/validation split.
model = LogisticRegression(max_iter=1000, solver='sag', random_state=42)

In [31]:
# Fit the classifier on the training split only; the validation split stays unseen.
model.fit(train_inputs, train_targets)

In [32]:
# Predicted labels for the training split, scored in the metrics cells below.
train_preds = model.predict(train_inputs)

In [33]:
# Predicted labels for the held-out validation split.
val_preds = model.predict(val_inputs)

### Accuracy and F1 Score

In [34]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [35]:
# Score the training-split predictions: accuracy first, then F1.
train_accuracy = accuracy_score(train_targets, train_preds)
print(f"Training Accuracy {train_accuracy}")
train_f1 = f1_score(train_targets, train_preds)
print(f"Training F1 Score {train_f1}")

Training Accuracy 0.9454218323608066
Training F1 Score 0.3857856773590015


In [36]:
# Score the validation-split predictions: accuracy first, then F1.
val_accuracy = accuracy_score(val_targets, val_preds)
print(f"Validation Accuracy {val_accuracy}")
val_f1 = f1_score(val_targets, val_preds)
print(f"Validation F1 Score {val_f1}")

Validation Accuracy 0.9462301926566404
Validation F1 Score 0.38378520663332455


### Making Predictions for Kaggle

In [37]:
# Preview the Kaggle test frame; the rendered table shows qid and question_text only.
test_df

Unnamed: 0,qid,question_text
0,0000163e3ea7c7a74cd7,Why do so many women become so rude and arroga...
1,00002bd4fb5d505b9161,When should I apply for RV college of engineer...
2,00007756b4a147d2b0b3,What is it really like to be a nurse practitio...
3,000086e4b7e1c7146103,Who are entrepreneurs?
4,0000c4c3fbe8785a3090,Is education really making good people nowadays?
...,...,...
375801,ffff7fa746bd6d6197a9,How many countries listed in gold import in in...
375802,ffffa1be31c43046ab6b,Is there an alternative to dresses on formal p...
375803,ffffae173b6ca6bfa563,Where I can find best friendship quotes in Tel...
375804,ffffb1f7f1a008620287,What are the causes of refraction of light?


In [38]:
# Predict labels for the vectorized Kaggle test questions.
test_preds = model.predict(test_inputs)

In [39]:
# Preview the sample submission; it has qid and a placeholder prediction column.
submission_df

Unnamed: 0,qid,prediction
0,0000163e3ea7c7a74cd7,0
1,00002bd4fb5d505b9161,0
2,00007756b4a147d2b0b3,0
3,000086e4b7e1c7146103,0
4,0000c4c3fbe8785a3090,0
...,...,...
375801,ffff7fa746bd6d6197a9,0
375802,ffffa1be31c43046ab6b,0
375803,ffffae173b6ca6bfa563,0
375804,ffffb1f7f1a008620287,0


In [40]:
# Replace the placeholder predictions in place with the model's output.
# Presumably row i of test_preds belongs to row i of submission_df: test_inputs
# was built from test_df, and the qid previews of the two frames match.
submission_df['prediction'] = test_preds

In [41]:
# Count how many test questions were assigned to each predicted class.
submission_df['prediction'].value_counts()

0    365940
1      9866
Name: prediction, dtype: int64

In [42]:
# Write the submission file without the DataFrame index, so the CSV contains
# only the frame's own columns. index=False is the documented boolean form;
# the previous index=None only worked because None is falsy.
submission_df.to_csv('submission.csv', index=False)

### The End