In [34]:
# import necessary libraries 

import pandas as pd 
import numpy as np 
import re, string 

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer


In [9]:
train = pd.read_csv('../datasets/train.csv', index_col='id')
test = pd.read_csv('../datasets/test.csv', index_col='id')
valid = pd.read_csv('../datasets/valid.csv', index_col='id')

In [14]:
train.head()

Unnamed: 0_level_0,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Sam has an everlast treat each nite before bed...,dogs
1,The product is as it says. I keep an eye on it...,dogs
2,My Kitty thinks these are treats! He loves the...,dogs
3,This is the third or fourth time that we've or...,dogs
4,Put this on both my dogs. And they are scratch...,dogs


In [15]:
valid.head()

Unnamed: 0_level_0,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,We started giving this to my 5 year old Labrad...,dogs
1,This product is easy to set up and use. I have...,dogs
2,my dog has nerves and wants to itch and chew.....,dogs
3,the breeded gave us a can of Nupro when we too...,dogs
4,"Very study, well made poop bag. Easy to open a...",dogs


In [12]:
test.head()

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
0,Great Filter. I have always used Marineland fi...
1,This is a great item. My elderly dog loves it....
2,"I have a shep/akita mix, but I bought this mor..."
3,I'm always amazed by what is available on Amaz...
4,I have 2 Abyssinians who will use any litterbo...


## Text Processing 

In [13]:
class TextProcessor: 
    
    def __init__(self) -> None:
        self.stopwordsAll = ["and", "or"] # for demonstration 
    
    
    def preprocess(self,text):
        """String Processor 
        
        - Strip the words 
        - Remove Punctuations 
        - Remove numbers
         
        """

        text = text.lower() 
        text = text.strip()
        text = re.compile('<.*?>').sub(' ', text)
        text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)
        text = re.sub('\s+', ' ', text)
        text = re.sub(r'\[[0-9]*\]', ' ', text)
        text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
        text = re.sub(r'\d', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text


    
    def stopword(self,string):
        
        a = [i for i in string.split() if i not in self.stopwordsAll]
        return ' '.join(a)


    def finalStep(self, string): 
        
        return self.stopword(self.preprocess(string))
    

In [16]:
combined_df = pd.concat([train, valid])

In [21]:
t = TextProcessor()

combined_df["processed_text"] = combined_df["text"].apply(lambda x : t.finalStep(x))
test["processed_text"] = test["text"].apply(lambda x : t.finalStep(x))

In [23]:
combined_df[["text","processed_text"]].head(3)

Unnamed: 0_level_0,text,processed_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Sam has an everlast treat each nite before bed...,sam has an everlast treat each nite before bed...
1,The product is as it says. I keep an eye on it...,the product is as it says i keep an eye on it ...
2,My Kitty thinks these are treats! He loves the...,my kitty thinks these are treats he loves them...


## Vectorizing

In [29]:
text_transformer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=200000) # max_features decides how many columns you will have 

In [30]:
X_train_text = text_transformer.fit_transform(combined_df['processed_text'])
X_test_text = text_transformer.transform(test['processed_text'])

In [31]:
X_train_text.shape, X_test_text.shape

((69410, 200000), (17353, 200000))

## Modelling 

In [38]:
model = LogisticRegression(C=5e1, solver='lbfgs', multi_class='multinomial', random_state=41, n_jobs=-1) 

In [35]:
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=41)

In [36]:
cv_results = cross_val_score(model, X_train_text, combined_df['label'], cv=skf, scoring='f1_micro')

In [37]:
cv_results # This gives us a general idea about model 

array([0.83424579, 0.84087307, 0.83727129, 0.83748739, 0.84325025])

In [39]:
model.fit(X_train_text, combined_df['label'])

LogisticRegression(C=50.0, multi_class='multinomial', n_jobs=-1,
                   random_state=41)

In [40]:
predictions = model.predict(X_test_text)

In [43]:
predictions[:15] # some of the predictions for the test data 

array(['fish aquatic pets', 'dogs', 'dogs', 'dogs', 'cats', 'dogs',
       'dogs', 'cats', 'dogs', 'cats', 'dogs', 'cats', 'cats', 'cats',
       'dogs'], dtype=object)