# Predicting Movie Recommendations using ID3 algoritm 

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import scipy
import pandas as pd
from uszipcode import SearchEngine

In [2]:
os.chdir("C:\\Users\\Mariam") 
movies=pd.read_csv('movies.dat',sep ='::', names=['MovieID','Title','Genres'],skipinitialspace=True)
ratings=pd.read_csv('ratings.dat',sep ='::', names=['UserID','MovieID','Rating','Timestamp'],skipinitialspace=True)
users=pd.read_csv('users.dat',sep ='::', names=['UserID','Gender','Age','Occupation','ZipCode'],skipinitialspace=True)

  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [3]:
# Definig global variables for binning
# the user age will be devided to 4 categories child(1:12), teen(13:19), youh (19:45), Old(45:56)
Age_bins = [0, 13 , 19 , 45 , 56] 
#The Movie production year will be devided into 8 category representing 1920's 1930's 1940's etc. 
year_bins = [1918, 1930 , 1940 , 1950, 1960,  1970, 1980 , 1990 , 2000]


## Helper processing Functions 

In [4]:
# A function that does label encoding 
def encode(df,  col ):
    unique_vals = np.unique(df[col])
    for i in range(len(unique_vals)):
        df.loc[df[col] == unique_vals[i], col] = i 
    return df

In [5]:
# a function that reverse's the label encoding (retrieves the code of a specific value)
def getCode(col , val ): # retrieves the value of the instance to be detected 
    #works for tge genre 1 qne  genre 2 in movies and zip values for the states 
    ix = np.where( col== val)
    return ix[0][0]

In [6]:
# A function that does the binning for the data 
def binning (df , col , bins, bin_labels , encode = True):
    df[col]= pd.cut(df[col] , bins , labels =bin_labels  )
    return df


In [7]:
#converts column values into two values based on a specifich threshold 
def binarize(df , col , thresh , num = True):
    if (num):
        df.loc[df[col] <= thresh, col] = 0 
        df.loc[df[col] > thresh, col] = 1 
    else:
        df.loc[df[col] == thresh, col] = 0 
        df.loc[df[col] !=  0 , col] = 1 
    return df    


In [8]:
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
def process_Movies(movies):
    #split Movie genres 
    movies[['Genre_1','Genre_2','Genre_3','Genre_4','Genre_5','Genre_6']]=movies.Genres.str.split("|",expand=True)
    #pick top 2 genres 
    movies = movies.drop('Genres' , axis = 1)
    movies = movies.drop(['Genre_3','Genre_4','Genre_5','Genre_6'] , axis = 1)
    #retrieve movie year 
    movies['Year']=movies.Title.str.slice(start=-6)
    movies['Title']=movies.Title.str.slice(stop=-6)
    movies['Year']=movies.Year.str.slice(start=1)
    movies['Year']=movies.Year.str.slice(stop=-1)
    movies['Year']=pd.to_numeric(movies['Year'])
    #binning the movie year 
    Year_ranges = [1 , 2 , 3 , 4, 5 , 6 , 7 , 8]
    movies= binning(movies, 'Year' ,year_bins , Year_ranges )
    movies= movies.drop('Title', axis=1)
    #processing Movie genres 
    movies['Genre_2']= movies['Genre_2'].fillna('Unknown')
    movies['Genre_1']= movies['Genre_1'].fillna('Unknown')
    movies = encode(movies ,  'Genre_1' )
    movies = encode(movies ,  'Genre_2' ) 
    
    return movies
    

In [10]:
Mov = process_Movies(movies)

In [11]:
Mov.head()

Unnamed: 0,MovieID,Genre_1,Genre_2,Year
0,1,2,2,8
1,2,1,2,8
2,3,4,12,8
3,4,4,6,8
4,5,4,15,8


In [12]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [13]:
def process_ratings(ratings):
    ratings = binarize(ratings , 'Rating' , 3)
    ratings = ratings.drop('Timestamp', axis =1 )
    return ratings 

In [14]:
ratings = process_ratings(ratings)

In [15]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating
0,1,1193,1
1,1,661,0
2,1,914,0
3,1,3408,1
4,1,2355,1


In [16]:
users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,ZipCode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [17]:
users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,ZipCode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [18]:
def retrieve_zips(col):
    search = SearchEngine(simple_zipcode=True)
    states = []
    for zipc in users['ZipCode']:
        zipcode = search.by_zipcode(zipc)
        state  = zipcode.state
        states = np.append(states, state)
    return states


In [19]:
#returns the unique states so that the user is able to select from them
def unique_states(col):
    return  np.unique(col)
    

In [20]:
def process_users(users):
    #Retrieving the Zip Code Corresponding to Every state 
    states =  retrieve_zips(users['ZipCode'])
    users['states'] = states 
    Users = users.drop('ZipCode', axis = 1 )
    Users = Users.fillna('Unknown')
    states = np.unique(Users['states'])
    Users = encode (Users, 'states')
    #Will devide the user age into 4 groups (child 1:12) teen (13:19) Adult (20:45) old (46:56)
    Age_bins = [0, 13 , 19 , 45 , 56]
    age_ranges = [1 , 2 , 3 , 4]
    Usesr= binning (Users , 'Age' , Age_bins, age_ranges , encode = True)
    Users =binarize(Users , 'Gender' , np.unique(Users['Gender'])[0], False)
    return states, Users


In [21]:
states, Users = process_users(users)
Users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,states
0,1,0,1,10,22
1,2,1,4,16,18
2,3,1,3,15,23
3,4,1,3,7,19
4,5,1,3,20,23


In [22]:
states

array(['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',
       'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
       'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM',
       'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN',
       'TX', 'UT', 'Unknown', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY'],
      dtype=object)

### Merging Tables 

In [23]:
Usermovies = pd.merge(ratings, Mov , on='MovieID', how='inner')

In [24]:
Usermovies = pd.merge(Usermovies, Users , on='UserID', how='inner')

In [25]:
Usermovies =  Usermovies.rename(columns={"Rating": "class"})


In [26]:
Usermovies = Usermovies.drop(['UserID','MovieID'] , axis=1)

In [27]:
Usermovies.head()

Unnamed: 0,class,Genre_1,Genre_2,Year,Gender,Age,Occupation,states
0,1,7,15,6,0,1,10,22
1,0,2,2,8,0,1,10,22
2,0,11,12,5,0,1,10,22
3,1,7,15,8,0,1,10,22
4,1,2,2,8,0,1,10,22


## Training Helper functions 

In [28]:
def train_test_split(dataset):
    n =round(0.66*len(dataset))
    training_data = dataset.iloc[:n].reset_index(drop=True)#We drop the index respectively relabel the index
    #starting form 0, because we do not want to run into errors regarding the row labels / indexes
    testing_data = dataset.iloc[n:].reset_index(drop=True)
    return training_data,testing_data

training_data = train_test_split(Usermovies)[0]
testing_data = train_test_split(Usermovies)[1] 


In [29]:
training_data.head()

Unnamed: 0,class,Genre_1,Genre_2,Year,Gender,Age,Occupation,states
0,1,7,15,6,0,1,10,22
1,0,2,2,8,0,1,10,22
2,0,11,12,5,0,1,10,22
3,1,7,15,8,0,1,10,22
4,1,2,2,8,0,1,10,22


# Decision tree Implementation  (Server)

In [30]:
def entropy(target_col):
    """
    Calculate the entropy of a dataset.
    The only parameter of this function is the target_col parameter which specifies the target column
    """
    elements,counts = np.unique(target_col,return_counts = True)
    entropy = np.sum([(-counts[i]/np.sum(counts))*np.log2(counts[i]/np.sum(counts)) for i in range(len(elements))])
    return entropy

In [31]:
def InfoGain(data,split_attribute_name,target_name="class"):
    
    # entropy of y
    total_entropy = entropy(data[target_name])
    
    
    vals,counts= np.unique(data[split_attribute_name],return_counts=True)
    
    Weighted_Entropy = np.sum([(counts[i]/np.sum(counts))*entropy(data.where(data[split_attribute_name]==vals[i]).dropna()[target_name]) for i in range(len(vals))])
    
    Information_Gain = total_entropy - Weighted_Entropy
    return Information_Gain




In [32]:
def ID3(data,originaldata,features,target_attribute_name="class",parent_node_class = None):
    
    if len(np.unique(data[target_attribute_name])) <= 1:
        return np.unique(data[target_attribute_name])[0]
    
    elif len(data)==0:
        return np.unique(originaldata[target_attribute_name])[np.argmax(np.unique(originaldata[target_attribute_name],return_counts=True)[1])]
        
    elif len(features) ==0:
        return parent_node_class
    
    
    else:
        parent_node_class = np.unique(data[target_attribute_name])[np.argmax(np.unique(data[target_attribute_name],return_counts=True)[1])]
        
        item_values = [InfoGain(data,feature,target_attribute_name) for feature in features] #Return the information gain values for the features in the dataset
        best_feature_index = np.argmax(item_values)
        best_feature = features[best_feature_index]
        
    
        tree = {best_feature:{}}
        
        
        features = [i for i in features if i != best_feature]
        
        
        for value in np.unique(data[best_feature]):
            value = value
            sub_data = data.where(data[best_feature] == value).dropna()
            
            subtree = ID3(sub_data,Usermovies,features,target_attribute_name,parent_node_class)
            
            tree[best_feature][value] = subtree
            
        return(tree)    

    

In [33]:
def predict(query,tree,default = 1):    
    for key in list(query.keys()):
        if key in list(tree.keys()):
        
            try:
                result = tree[key][query[key]] 
            except:
                return default
  
            
            result = tree[key][query[key]]
            
            if isinstance(result,dict):
                return predict(query,result)

            else:
                return result


In [34]:
def test(data,tree):
    #Create new query instances by simply removing the target feature column from the original dataset and 
    #convert it to a dictionary
    queries = data.to_dict(orient = "records")
    
    #Create a empty DataFrame in whose columns the prediction of the tree are stored
    predicted = []
    
    #Calculate the prediction accuracy
    for i in range(len(data)):
        p = predict(queries[i],tree,1.0) 
        predicted = np.append(predicted , p)
    return predicted

In [35]:
Y = Usermovies['class']
X = Usermovies.drop('class', axis=1) 

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size=0.33, random_state=42)

X_train['class'] = y_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


***Running the model on the training data (While deleting the class column)***

In [36]:
tree = ID3(X_train,X_train,X_train.columns[:-1])


## Testing The Model

### Training Error

In [37]:
n_sample = 20000

In [38]:
x_train = X_train[1:n_sample]
x_train= x_train.drop('class', axis=1)

In [39]:
y = test (x_train , tree)

In [40]:
y

array([1., 1., 0., ..., 1., 0., 0.])

In [41]:
err  =  abs(y_train[1:n_sample] - y)

In [42]:
sum(err)/n_sample

0.2684

### Testing Error

In [43]:
X_test.head()

Unnamed: 0,Genre_1,Genre_2,Year,Gender,Age,Occupation,states
895536,4,6,7,1,3,14,4
899739,0,14,7,1,3,11,23
55687,4,15,8,1,2,4,49
63727,4,15,8,1,3,20,34
822011,0,0,6,1,3,12,50


In [44]:
X_test = X_test[1:n_sample]

In [45]:
y = test (X_test , tree)

In [46]:
y

array([1., 1., 0., ..., 1., 0., 1.])

In [47]:
y_test = y_test.values


In [48]:
fp = 0  
fn = 0
tp = 0 
tn = 0 
for i in range (len(y)):
    if (y[i] > y_test[i]):
        fp = fp + 1 
    if (y[i] < y_test[i]):
        fn = fn + 1
    if (y[i] == y_test[i]): 
        if (y[i]==1):
            tp = tp+1
        if (y[i] == 0):
            tn = tn+1
print("False positive", fp)
print("False Negative", fn)
print("True positive", tp)
print("True Negative", tn)

False positive 5321
False Negative 4309
True positive 7208
True Negative 3161


In [49]:
err  =  abs(y_test[1:n_sample] - y)

In [50]:
sum(y)

12529.0

In [51]:
len(y)

19999

In [52]:
sum(err)/n_sample

0.3872

## User Input Model

In [53]:
gender= str(input("Please enter your Gender F for female and M for male"))
while (gender!= 'F'and gender!= 'M' ):
    gender  = str(input("Invalid please Re-Enter"))
if (gender == 'F'):
    gender  = 0 
else:
    gender = 1 

Please enter your Gender F for female and M for maleM


In [54]:
Age = int(input("Please enter your Age"))
if (Age >= Age_bins[3]):
    Age=4
elif (Age>= Age_bins[2]):
    Age=3
elif (Age >= Age_bins[1]):
    Age =2
else:
    Age =1 
    

Please enter your Age22


In [55]:
year_bins


[1918, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000]

In [56]:
Year  = int(input("Please enter the production year Age")) 
if (Year >= year_bins[7]):
    Year=8
elif (Year >= year_bins[6]):
    Year=7
elif (Year >= year_bins[5]):
    Year=6
elif (Year >= year_bins[4]):
    Year=5
elif (Year >= year_bins[3]):
    Year=4
elif (Year >= year_bins[2]):
    Year=3
elif (Year >= year_bins[1]):
    Year=2
else:
    Year=1


Please enter the production year Age1990


In [58]:
def getMoviegenres(movies):
    movies[['Genre_1','Genre_2','Genre_3','Genre_4','Genre_5','Genre_6']]=movies.Genres.str.split("|",expand=True)
    mov = movies.fillna('Unknown')
    return np.unique(movies['Genre_1'])
print ('Choose the movie genre(s)')
genres = getMoviegenres(movies)
for i in range(len(genres)):
    print ("Choose   " + str(i) + "   for  " + str(genres[i]))
genre_1 = int(input('First Genre'))
genre_2 = int(input('Second Genre'))


Choose the movie genre(s)
Choose   0   for  Action
Choose   1   for  Adventure
Choose   2   for  Animation
Choose   3   for  Children's
Choose   4   for  Comedy
Choose   5   for  Crime
Choose   6   for  Documentary
Choose   7   for  Drama
Choose   8   for  Fantasy
Choose   9   for  Film-Noir
Choose   10   for  Horror
Choose   11   for  Musical
Choose   12   for  Mystery
Choose   13   for  Romance
Choose   14   for  Sci-Fi
Choose   15   for  Thriller
Choose   16   for  War
Choose   17   for  Western
First Genre16
Second Genre6


In [59]:
ZipCode =  str(input("Please Enter your ZipCode"))

Please Enter your ZipCode70072


In [60]:
def retrieve_zip(ZipCode):
    search = SearchEngine(simple_zipcode=True)
    zipcode = search.by_zipcode(ZipCode)
    state  = zipcode.state
    return state

In [61]:
state = retrieve_zip(ZipCode)

In [62]:
state

'LA'

In [63]:
code = getCode(states , state)


In [64]:
state = code

In [65]:

print ( "0:  other or not specified")
print ("1:  academic/educator")
print ("2:  artist")
print ("4:  clerical/admin")
print  ("4:  college/grad student")
print ("5:  customer service")
print (" 6:  doctor/health care")
print( " 7:  executive/managerial")
print (" 8:  farmer")
print ("  9:  homemaker") 
print ("10:  K-12 student") 
print ( "11:  lawyer")
print ( "12:  programmer")
print ("13:  retired")
print ("14:  sales/marketing")
print (" 15:  scientist")
print (" 16:  self-employed")
print (" 17:  technician/engineer")
print (" 18:  tradesman/craftsman")
print (" 19:  unemployed")
print (" 20:  writer")
occupation= str ( input ("please select your occupation "))

0:  other or not specified
1:  academic/educator
2:  artist
4:  clerical/admin
4:  college/grad student
5:  customer service
 6:  doctor/health care
 7:  executive/managerial
 8:  farmer
  9:  homemaker
10:  K-12 student
11:  lawyer
12:  programmer
13:  retired
14:  sales/marketing
 15:  scientist
 16:  self-employed
 17:  technician/engineer
 18:  tradesman/craftsman
 19:  unemployed
 20:  writer
please select your occupation 2


In [66]:
x = {
  'Genre_1': genre_1,
  'Genre_2': genre_2,
  'Year': Year,
  'Gender': gender ,
  'Age': Age,
  'Occupation': occupation ,
  'states': state}
x

{'Genre_1': 16,
 'Genre_2': 6,
 'Year': 8,
 'Gender': 1,
 'Age': 3,
 'Occupation': '2',
 'states': 18}

In [67]:
X_train.head()

Unnamed: 0,Genre_1,Genre_2,Year,Gender,Age,Occupation,states,class
933693,7,15,6,0,3,6,6,0
31456,15,15,5,1,4,13,50,1
136353,7,15,7,1,3,11,20,0
326102,10,14,5,1,4,3,44,1
412038,4,15,7,1,3,12,11,1


In [68]:
##send to the server
p = predict(x,tree,1.0) 


In [69]:
print(p)

1


In [80]:

queries = X_test.to_dict(orient = "records")


In [81]:
queries

[{'Genre_1': 0,
  'Genre_2': 14,
  'Year': 7,
  'Gender': 1,
  'Age': 3,
  'Occupation': 11,
  'states': 23},
 {'Genre_1': 4,
  'Genre_2': 15,
  'Year': 8,
  'Gender': 1,
  'Age': 2,
  'Occupation': 4,
  'states': 49},
 {'Genre_1': 4,
  'Genre_2': 15,
  'Year': 8,
  'Gender': 1,
  'Age': 3,
  'Occupation': 20,
  'states': 34},
 {'Genre_1': 0,
  'Genre_2': 0,
  'Year': 6,
  'Gender': 1,
  'Age': 3,
  'Occupation': 12,
  'states': 50},
 {'Genre_1': 5,
  'Genre_2': 6,
  'Year': 8,
  'Gender': 0,
  'Age': 3,
  'Occupation': 7,
  'states': 31},
 {'Genre_1': 4,
  'Genre_2': 12,
  'Year': 8,
  'Gender': 0,
  'Age': 2,
  'Occupation': 4,
  'states': 50},
 {'Genre_1': 0,
  'Genre_2': 0,
  'Year': 8,
  'Gender': 0,
  'Age': 4,
  'Occupation': 13,
  'states': 15},
 {'Genre_1': 7,
  'Genre_2': 15,
  'Year': 8,
  'Gender': 1,
  'Age': 3,
  'Occupation': 5,
  'states': 28},
 {'Genre_1': 0,
  'Genre_2': 0,
  'Year': 8,
  'Gender': 0,
  'Age': 3,
  'Occupation': 7,
  'states': 34},
 {'Genre_1': 0,
  '

In [84]:
predicts = []
for i in range(len(queries)): 
    p = predict(queries[i],tree,1.0) 
    predicts = np.append(predicts , p)

In [85]:
len(predicts)

19999

In [86]:
np.where(predicts==0)

(array([    2,     7,     8, ..., 19990, 19995, 19997], dtype=int64),)

In [None]:
def InfoGain(data,split_attribute_name,target_name="class"):
    
    # entropy of y
    total_entropy = entropy(data[target_name])
    
    
    vals,counts= np.unique(data[split_attribute_name],return_counts=True)
    
    Weighted_Entropy = np.sum([(counts[i]/np.sum(counts))*entropy(data.where(data[split_attribute_name]==vals[i]).dropna()[target_name]) for i in range(len(vals))])
    
    Information_Gain = total_entropy - Weighted_Entropy
    return Information_Gain


In [90]:
Genre_1_info_gain = InfoGain(Usermovies, 'Genre_1' , 'class')
Genre_1_info_gain

0.013717265431952508

In [93]:
Genre_2_info_gain = InfoGain(Usermovies, 'Genre_2' , 'class')
Genre_2_info_gain

0.006061353741652953

In [100]:
Year_info_gain = InfoGain(Usermovies, 'Year' , 'class')
Year_info_gain

0.013135281431026602

In [96]:
Gender_info_gain = InfoGain(Usermovies, 'Gender' , 'class')
Gender_info_gain

0.00023443206963769114

In [98]:
Age_info_gain = InfoGain(Usermovies, 'Age' , 'class')
Age_info_gain

0.0013885352484802604

In [103]:
Occupation_info_gain = InfoGain(Usermovies, 'Occupation' , 'class')
Occupation_info_gain

0.0014983134484606309

In [107]:
state_info_gain = InfoGain(Usermovies, 'states' , 'class')
state_info_gain

0.0022105008766325485