# Natural Language Processing

In [4]:
# importing required libraries
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from gensim.models.word2vec import Word2Vec
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import scale
from sklearn.ensemble import GradientBoostingClassifier

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\GIGA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# loading sms data
data = pd.read_csv('spam.csv', encoding='latin-1')
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [6]:
# checking the distribution of the target variable
data["v1"].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [7]:
# pre processing text
def text_pre_processing(sms):
    
    # removing punctuations
    sms_wo_punct = [x for x in sms if x not in string.punctuation]
    sms_wo_punct = ''.join(sms_wo_punct)
    
    # keeping only english letters
    sms_wo_punct_only_words = [x for x in sms_wo_punct if x.isalpha() or x == " "]
    sms_wo_punct_only_words = ''.join(sms_wo_punct_only_words)
    
    # converting the data to lower case
    sms_wo_punct_only_words_lc = sms_wo_punct_only_words.lower()
    
    # removing stopwords
    sms_wo_punct_only_words_lc = sms_wo_punct_only_words_lc.split(" ")
    sms_wo_punct_only_words_lc_wo_sw = [x for x in sms_wo_punct_only_words_lc if x not in stopwords.words('english')]
    sms_wo_punct_only_words_lc_wo_sw = ' '.join(sms_wo_punct_only_words_lc_wo_sw)
    
    return sms_wo_punct_only_words_lc_wo_sw

In [8]:
# pre processed sms
data_processed = pd.DataFrame(columns = ["pre_processed_sms"])
for i in tqdm(data["v2"]):
    data_processed = data_processed.append({"pre_processed_sms": text_pre_processing(i)}, ignore_index = True)
data_processed

100%|█████████████████████████████████████████████████████████████████████████████| 5572/5572 [00:39<00:00, 141.78it/s]


Unnamed: 0,pre_processed_sms
0,go jurong point crazy available bugis n great ...
1,ok lar joking wif u oni
2,free entry wkly comp win fa cup final tkts st...
3,u dun say early hor u c already say
4,nah dont think goes usf lives around though
...,...
5567,nd time tried contact u u å pound prize clai...
5568,ì b going esplanade fr home
5569,pity mood soany suggestions
5570,guy bitching acted like id interested buying s...


In [9]:
# Splitting into train and test
x_train, x_test, y_train, y_test = train_test_split(data_processed, data["v1"], test_size = 0.20, random_state = 100)
x_train = x_train.reset_index(drop = True)
x_test = x_test.reset_index(drop = True)
y_train = y_train.reset_index(drop = True)
y_test = y_test.reset_index(drop = True)

In [10]:
# Using word to vec
alg = Word2Vec(size = 500, min_count = 1)
alg.build_vocab(x_train)
alg.train(x_train, total_examples = len(x_train), epochs = alg.epochs)

TypeError: __init__() got an unexpected keyword argument 'size'

In [None]:
# converting words to vector
def convert_word_to_vector(sms, size):
    vec = np.zeros(500).reshape((1, 500))
    count = 0
    for word in sms:
        try:
            vec += alg[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

In [None]:
# converting training and testing to vectors
x_train_vecs = scale(np.concatenate([convert_word_to_vector(z, 500) for z in x_train["pre_processed_sms"]]))
x_test_vecs = scale(np.concatenate([convert_word_to_vector(z, 500) for z in x_test["pre_processed_sms"]]))

In [None]:
# training a gradient boosting classifier
clf = GradientBoostingClassifier(n_estimators = 100, max_depth = 3, random_state = 100)
clf.fit(x_train_vecs, y_train)

In [None]:
# training confusion matrix
y_train_pred = clf.predict(x_train_vecs)
confusion_matrix(y_train,y_train_pred)

In [None]:
# testing confusion matrix
y_test_pred = clf.predict(x_test_vecs)
confusion_matrix(y_test,y_test_pred)

# Recommender Systems

In [None]:
# loading the dataset
df = pd.read_csv('movies_metadata.csv')
df

In [None]:
# checking percentage null for each columns
round(100*(df.isnull().sum()/len(df.index)), 2)

In [None]:
# checking the information about each column
df.info()

In [None]:
# creating a budget - revenue column to calculate profit
# but as visible above, the budget column is of object type which is needed to be converted to float first
df['budget'] = pd.to_numeric(df['budget'], errors='coerce')
df.info()

In [None]:
df['profit'] = df['budget'] - df['revenue']

In [None]:
# checking the top profit movies
df = df.sort_values('profit', ascending = False)
df