# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import pickle
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Reading Data From CSV File

In [2]:
data = pd.read_csv('flipkart.csv', low_memory=False, encoding='utf-8')

# Creating a NEW dataframe from acutal dataframe by selecting specfic columns

In [3]:
df = pd.DataFrame(data[['description','brand']])
df

Unnamed: 0,description,brand
0,Key Features of Alisha Solid Women's Cycling S...,Alisha
1,FabHomeDecor Fabric Double Sofa Bed (Finish Co...,FabHomeDecor
2,Key Features of AW Bellies Sandals Wedges Heel...,AW
3,Key Features of Alisha Solid Women's Cycling S...,Alisha
4,Specifications of Sicons All Purpose Arnica Do...,Sicons
...,...,...
19995,Buy WallDesign Small Vinyl Sticker for Rs.730 ...,WallDesign
19996,Buy Wallmantra Large Vinyl Stickers Sticker fo...,Wallmantra
19997,Buy Elite Collection Medium Acrylic Sticker fo...,Elite Collection
19998,Buy Elite Collection Medium Acrylic Sticker fo...,Elite Collection


# Cleaning Messy Column Names

In [4]:
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
df.columns

Index(['description', 'brand'], dtype='object')

# Removing Special Characters from Dataframe

In [5]:
# Replacing dataFrame in another Variable.
df_spsl = df

# Declaration all Sorts of Special Characters in the form of List.
spec_chars = ["!",'"',"#","%","&","'","(",")",
              "*","+",",","-",".","/",":",";","<",
              "=",">","?","@","[","\\","]","^","_",
              "`","{","|","}","~","–", "//", "%*", ":/", ".;", "Ø", "§"]

# Checking
for c in spec_chars:
    c1 = "\\" + c
    # Replacing With SPACE"".
    df_spsl=df_spsl.replace(c1,"", regex=True)

# Replacing DataFrame with no Special characters.
df1 = df_spsl
display(df1)

Unnamed: 0,description,brand
0,Key Features of Alisha Solid Womens Cycling Sh...,Alisha
1,FabHomeDecor Fabric Double Sofa Bed Finish Col...,FabHomeDecor
2,Key Features of AW Bellies Sandals Wedges Heel...,AW
3,Key Features of Alisha Solid Womens Cycling Sh...,Alisha
4,Specifications of Sicons All Purpose Arnica Do...,Sicons
...,...,...
19995,Buy WallDesign Small Vinyl Sticker for Rs730 o...,WallDesign
19996,Buy Wallmantra Large Vinyl Stickers Sticker fo...,Wallmantra
19997,Buy Elite Collection Medium Acrylic Sticker fo...,Elite Collection
19998,Buy Elite Collection Medium Acrylic Sticker fo...,Elite Collection


# Creating a custom class to Impute Null values using TransformerMixin

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.

        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

df = DataFrameImputer().fit_transform(df1)
display(df)    

Unnamed: 0,description,brand
0,Key Features of Alisha Solid Womens Cycling Sh...,Alisha
1,FabHomeDecor Fabric Double Sofa Bed Finish Col...,FabHomeDecor
2,Key Features of AW Bellies Sandals Wedges Heel...,AW
3,Key Features of Alisha Solid Womens Cycling Sh...,Alisha
4,Specifications of Sicons All Purpose Arnica Do...,Sicons
...,...,...
19995,Buy WallDesign Small Vinyl Sticker for Rs730 o...,WallDesign
19996,Buy Wallmantra Large Vinyl Stickers Sticker fo...,Wallmantra
19997,Buy Elite Collection Medium Acrylic Sticker fo...,Elite Collection
19998,Buy Elite Collection Medium Acrylic Sticker fo...,Elite Collection


# Sepereating Fetures as X and y

In [7]:
X, y = df.description, df.brand

# Text Preprocessing

In [8]:
documents = []

from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

for sen in range(0, len(X)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X[sen]))
    
    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    
    # Converting to Lowercase
    document = document.lower()
    
    # Lemmatization
    document = document.split()

    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    
    documents.append(document)
    

# Bag of Words (BoW)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
# Load the text data
corpus = documents

vectorizer = CountVectorizer(max_features=10, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
docs       = vectorizer.fit_transform(corpus)

# Creating Bag Of Words DataFrame 
BOW = pd.DataFrame(docs.A, columns=vectorizer.get_feature_names()).to_string()
display(BOW)


'       buy  day  delivery  flipkartcom  free  genuine  online  price  product  woman\n0        0    0         0            0     0        0       0      0        0      3\n1        0    0         2            0     1        0       0      1       17      0\n2        0    1         0            0     0        0       0      1        1      1\n3        0    0         0            0     0        0       0      0        0      3\n4        0    0         0            0     0        0       0      0        0      0\n5        0    0         0            0     0        0       0      1        3      0\n6        0    0         0            0     0        0       0      0        0      3\n7        0    0         2            0     1        0       0      1       17      0\n8        0    0         0            0     0        0       0      0        1      1\n9        0    0         0            0     0        0       0      0        0      3\n10       0    0         0            0     0        0

# Finding TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from yellowbrick.text import TSNEVisualizer

# Load the data and create document vectors
corpus = documents
tfidf = TfidfVectorizer()

X = tfidf.fit_transform(corpus)

# Create the visualizer and draw the vectors
tsne = TSNEVisualizer()
tsne.fit(X, y)
tsne.show()

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()
X

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))