**Importing Libraries**
--






In [None]:
import random
import re
import string

import nltk
import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin

**Reading Data From CSV File**
--

In [None]:
# Load flipkart.csv (path is relative to the notebook's working directory).
# low_memory=False makes pandas parse the file in one pass rather than in
# chunks, so each column gets a single inferred dtype.
data = pd.read_csv('flipkart.csv', low_memory=False)

**Printing DataFrame**
--

In [None]:
# Show the raw frame with the notebook's rich (HTML) rendering.
display(data)

**Cleaning Messy Column Names**
--

In [None]:
# Normalise the column labels: trim surrounding whitespace, lower-case,
# turn spaces into underscores and drop parentheses.
# regex=False makes every replacement a literal substring match. Without it
# the result depends on the pandas version: before 2.0 the pattern defaults to
# a regular expression (where a lone "(" is not a valid pattern) and pandas
# emits a FutureWarning for these single-character patterns.
data.columns = (
    data.columns.str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)
data.columns

**Creating a new dataframe from the actual dataframe by selecting specific columns**
--

In [None]:
# Narrow the frame to the brand and description columns (all rows kept).
df = data.loc[:, ['brand', 'description']]

**Displaying the NEW dataframe**
--

In [None]:
# Show the two-column selection with the notebook's rich (HTML) rendering.
display(df)

**Removing Special Characters from Dataframe**
--

In [None]:
# Start from the column selection under a separate name. DataFrame.replace
# returns a new frame, so `df` itself is never modified by this cell.
df_spsl = df

# Punctuation and symbols to strip from every text cell.
spec_chars = ["!",'"',"#","%","&","'","(",")",
              "*","+",",","-",".","/",":",";","<",
              "=",">","?","@","[","\\","]","^","_",
              "`","{","|","}","~","–", "//", "%*", ":/", ".;", "Ø", "§"]

# One character class covering every character that appears in the list.
# The multi-character entries ("//", "%*", ...) consist only of characters that
# are also listed individually, so they add nothing to the set. re.escape
# keeps "]", "\\", "^" and "-" literal inside the class.
spec_chars_pattern = (
    "[" + "".join(re.escape(ch) for ch in sorted(set("".join(spec_chars)))) + "]"
)

# Delete the characters (replacement is the empty string, not a space) in a
# single pass over the frame instead of one full-frame pass per character.
# Only string cells are affected; missing values stay missing.
df_spsl = df_spsl.replace(spec_chars_pattern, "", regex=True)

# Frame with the special characters removed; used by the following cells.
df1 = df_spsl
display(df1)

**Counting NaN values in dataframe**
--

In [None]:
# Per-column count of missing values in the cleaned frame.
df1.isna().sum()

**Creating a custom class to Impute Null values using TransformerMixin**
--

In [None]:
class DataFrameImputer(TransformerMixin):
    """Impute missing values in a DataFrame, column by column.

    Columns of dtype object are imputed with the most frequent value in the
    column. Columns of other types are imputed with the mean of the column.

    ``fit`` learns one fill value per column and stores them in ``self.fill``
    (a Series indexed by column name); ``transform`` applies them with
    ``DataFrame.fillna``. ``fit_transform`` is inherited from
    ``TransformerMixin``.
    """

    def __init__(self):
        """Create an unfitted imputer; there are no hyper-parameters."""

    @staticmethod
    def _fill_value(column):
        """Return the value used to fill the gaps of a single column."""
        if column.dtype == np.dtype('O'):
            counts = column.value_counts()
            # value_counts() drops NaN, so an all-missing (or empty) column
            # has no most frequent value. Returning NaN leaves such a column
            # untouched instead of raising IndexError.
            return counts.index[0] if len(counts) else np.nan
        return column.mean()

    def fit(self, X, y=None):
        """Learn the per-column fill values from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose columns are inspected.
        y : ignored
            Accepted only for scikit-learn API compatibility.

        Returns
        -------
        DataFrameImputer
            ``self``, so calls can be chained.
        """
        self.fill = pd.Series([self._fill_value(X[c]) for c in X],
                              index=X.columns)

        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with gaps filled by the learned values.

        ``fit`` must have been called first; ``y`` is ignored.
        """
        return X.fillna(self.fill)

**Applying Imputation on a Dataframe**
--

In [None]:
# Learn the per-column fill values from df1 and apply them in one step.
# Note: this rebinds `df`, which until now held the raw two-column selection.
df = DataFrameImputer().fit_transform(df1)
display(df)

**Checking for NA values again**
--

In [None]:
# A notebook cell only echoes its LAST expression, so the per-column
# missing-value counts must be displayed explicitly or they are discarded.
display(df.isna().sum())
# Number of rows in the imputed frame.
df.shape[0]

In [None]:
# Cleaning the texts
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Loop invariants, built once: the stop-word list is loaded a single time and
# turned into a set for O(1) membership tests, and one stemmer is shared by
# all rows (previously both were rebuilt inside the loop).
english_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []
rows = df.shape[0]
# Iterate over the values directly rather than looking rows up by the labels
# 0..rows-1; the order is the same for a default RangeIndex and this also
# works when the index is anything else.
for description in df['description']:
    # Keep letters only, then lower-case and tokenise on whitespace.
    des = re.sub('[^a-zA-Z]', ' ', description)
    des = des.lower()
    des = des.split()
    # Drop stop words and stem what is left.
    des = [ps.stem(word) for word in des if word not in english_stopwords]
    des = ' '.join(des)
    corpus.append(des)

# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
# max_features=10 keeps only the 10 most frequent terms of the corpus.
cv = CountVectorizer(max_features = 10)
X = cv.fit_transform(corpus).toarray()
X

**With Keras**

In [None]:
import pandas as pd
from nltk import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk import download

# download('stopwords')

# Load the text dataset used from here on (path is relative to the working
# directory). Note: this rebinds `df`, replacing the frame built by the
# earlier cells.
df = pd.read_csv("final_text_db.csv", encoding='utf-8')

class Preprocess():
    """Clean a collection of texts and expose it as a TF-IDF term matrix.

    Typical use: construct with the raw texts, call ``clean_text()`` once,
    then read the result through ``array()`` or ``make_df()``.

    Attributes set by ``__init__``
        text  : pandas Series of documents; replaced by the cleaned strings
                once ``clean_text()`` has run.
        sw    : collection of stop words to drop; a falsy value disables
                stop-word removal.
        lower : lower-case the texts before tokenising.
        stem  : Porter-stem the tokens.

    Attributes set by ``clean_text()``
        vectorizer : the fitted ``TfidfVectorizer``.
        df_dense   : the matrix returned by ``fit_transform`` (a sparse
                     matrix, despite the name).
    """

    # The default for `sw` is evaluated once, when the class is defined, so
    # the NLTK stop-word corpus must already be downloaded at that point.
    def __init__(self, text, sw=stopwords.words('english'), lower=True, stem = True):
        """Store the texts and the cleaning options.

        Anything that is not already a Series (a list, a single string, ...)
        is wrapped in one.
        """
        if not isinstance(text, pd.Series):
            text = pd.Series(text)

        self.text = text
        self.sw = sw
        self.lower = lower
        self.stem = stem

    def clean_text(self):
        """Tokenise, filter and stem ``self.text``, then fit the TF-IDF model.

        Overwrites ``self.text`` with the cleaned, space-joined documents and
        sets ``self.vectorizer`` and ``self.df_dense``.
        """
        # Built once per call: a set gives O(1) stop-word lookups and a single
        # stemmer is shared by every token.
        stop_words = set(self.sw) if self.sw else set()
        stemmer = PorterStemmer()

        def remove_sw(word_list):
            return [word for word in word_list if word not in stop_words]

        def stem(word_list):
            return [stemmer.stem(word) for word in word_list]

        if self.lower:
            self.text = self.text.str.lower()

        # A missing text becomes an empty document instead of crashing on
        # `.split()`.
        self.text = self.text.apply(lambda x: [] if pd.isna(x) else x.split())

        # Stop words are removed BEFORE stemming: stemming first turns words
        # such as "this" or "was" into "thi" / "wa", which no longer match the
        # stop-word list and would slip through the filter.
        if self.sw: self.text = self.text.apply(remove_sw)
        if self.stem: self.text = self.text.apply(stem)

        self.text = self.text.apply(lambda x: ' '.join(x))
        self.vectorizer = TfidfVectorizer()
        self.df_dense = self.vectorizer.fit_transform(self.text)

    def _feature_names(self):
        """Return the vocabulary terms in column order as a list."""
        # get_feature_names() was removed in scikit-learn 1.2; prefer its
        # replacement and fall back only on releases that predate it.
        getter = getattr(self.vectorizer, 'get_feature_names_out', None)
        if getter is None:
            getter = self.vectorizer.get_feature_names
        return list(getter())

    def array(self, onehot=1):
        """Return the term matrix as a new dense NumPy array.

        With a truthy ``onehot`` every positive weight is replaced by 1
        (term present / absent); otherwise the TF-IDF weights are returned.
        Requires ``clean_text()`` to have been called.
        """
        # toarray() already allocates a fresh array, so it is safe to modify.
        dense = self.df_dense.toarray()
        if onehot:
            dense[dense>0] = 1
        return dense

    def make_df(self,onehot=1):
        """Return a DataFrame with a leading 'Text' column and one column per term.

        ``onehot`` is forwarded to ``array()``. Requires ``clean_text()`` to
        have been called.
        """
        df = pd.DataFrame(columns=self._feature_names(),
                          data=self.array(onehot))
        # Attach the texts by position: the matrix rows carry a fresh 0..n-1
        # index, so aligning on the input Series' own index could yield NaN or
        # mismatched rows when that index is not the default one.
        df.insert(0, 'Text', self.text.values)
        return df

# Clean the QUOTE column and fit the TF-IDF model on it.
docs = Preprocess(df.QUOTE)
docs.clean_text()
# onehot=1 turns every positive weight into 1, i.e. term-presence flags.
text_df = docs.make_df(onehot=1)