# NLP

## Import necessary libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score 

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer 

## Import Dataset

In [2]:
# Load the fashion text dataset from its CSV file into a DataFrame
fashion_df = pd.read_csv(filepath_or_buffer="./datasets/fashion_text_dataset.csv")

In [24]:
# Preview the first 10 rows of the dataset
fashion_df.head(n=10)

Unnamed: 0,description,category
0,Sless Flippy Mini Dress,women dresses
1,Midi Pleated Skirt,women skirts
2,Stamos Mix Silk Raglan Knit Sweater,men outwear
3,Basic Training Tank Top,men tops
4,Fashion Camouflage Hooded Jacket,women outwear
5,Straight Leg Sweatpants,men trousers
6,Hooded Pull Over,men outwear
7,Ally 3/4 Spring Pullover,women outwear
8,Cropped Pleated Pants,women trousers
9,NB Athletics Archive Run Pants,men trousers


In [4]:
# Number of non-null entries in each column
fashion_df.count(axis=0)

description    2000
category       2000
dtype: int64

# Information of the dataset

## Create a bag of words

In [5]:
# One description string per document. Materialised as a list rather than a
# lazy `map` iterator: an iterator is exhausted after the first pass, so any
# later use of `docs` would silently see zero documents.
docs = fashion_df["description"].tolist()

# Unigram bag of words with English stop words removed. max_df=1.0, min_df=1
# and max_features=None mean no term is pruned by document frequency or by
# vocabulary size.
cv = CountVectorizer(stop_words="english",
                     analyzer="word",
                     ngram_range=(1, 1),
                     max_df=1.0, min_df=1,
                     max_features=None)

# Learn the vocabulary and build the sparse document-term count matrix
# (rows = documents, columns = vocabulary terms).
word_count_vector = cv.fit_transform(docs)
print(word_count_vector.shape)

(2000, 1538)


## Get more insights from IDF & TFIDF

In [6]:
# ---  IDF  ---
# Learn the (smoothed) inverse document frequency of every vocabulary term
# from the word-count matrix.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

# idf values, one row per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and returns the terms in the same column order.
df_idf = pd.DataFrame(tfidf_transformer.idf_,
                      index=cv.get_feature_names_out(),
                      columns=["idf"])

# sort ascending, so the terms with the lowest idf are listed first
df_idf = df_idf.sort_values(by=["idf"])
df_idf.head(10)

Unnamed: 0,idf
shorts,3.060139
skirt,3.24011
dress,3.273526
shirt,3.313135
sleeve,3.624669
pants,3.681421
jacket,3.757215
midi,3.957011
fit,3.966673
short,4.016435


In [7]:
# ---  TFIDF  ---
# tf-idf scores for every document, computed from the word counts
tf_idf_vector = tfidf_transformer.transform(word_count_vector)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement.
feature_names = cv.get_feature_names_out()

# get tfidf vector for first document (a 1 x n_terms sparse row)
first_document_vector = tf_idf_vector[0]

# print the scores: transpose to one row per term, and convert with toarray()
# so pandas receives a plain ndarray instead of a legacy np.matrix.
df = pd.DataFrame(first_document_vector.T.toarray(), index=feature_names, columns=["tfidf"])
df = df.sort_values(by=["tfidf"], ascending=False)
df.head(10)

Unnamed: 0,tfidf
sless,0.640754
flippy,0.607902
mini,0.386706
dress,0.265232
0002,0.0
prem,0.0
prairie,0.0
power,0.0
postgame,0.0
popover,0.0


## TFIDF Embedding

In [22]:
# TfidfVectorizer counts terms and applies tf-idf weighting in a single step;
# the tokenisation settings (stop words, analyzer, n-gram range) go here.
tfidf_vectorizer = TfidfVectorizer(use_idf=True,
                                   stop_words="english",
                                   analyzer="word",
                                   ngram_range=(1, 1))

# One description string per document, as a reusable list (a `map` iterator
# would be empty after fit_transform has consumed it).
docs = fashion_df["description"].tolist()

# Learn vocabulary + idf weights and return the tf-idf matrix for all docs
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(docs)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. Fetch the vocabulary once and reuse it.
vocab = tfidf_vectorizer.get_feature_names_out()
print('Length of vocab: ', len(vocab))
# Show only the first few terms instead of dumping the whole vocabulary
print('Sample of vocab', list(vocab[:10]))

Length of vocab:  1538


# Data Processing and Split data

## Split data into Training and Validation Sets

In [9]:
# 80% Training 20% Validation
# random_state fixes the shuffle so that the split, and every result computed
# from it, is the same on each Restart & Run All.
_X_train, _X_test, y_train, y_test = train_test_split(
    fashion_df['description'],
    fashion_df['category'],
    test_size=0.2,
    random_state=42,
)

## Add TF-IDF features to our model

In [14]:
# Word-level TF-IDF over unigrams and bigrams, English stop words removed
vect = TfidfVectorizer(analyzer="word", stop_words="english", ngram_range=(1, 2))

# _X_train and _X_test  --- vectorized ---> X_train and X_test
# The vocabulary and idf weights are learned from the training texts only;
# the held-out texts are then projected onto that same vocabulary.
X_train = vect.fit_transform(_X_train)
X_test = vect.transform(_X_test)

print(X_train.shape, X_test.shape, sep="\n")

(1600, 4728)
(400, 4728)


# Training using Deep Neural Network