## DataMining Assignment 2

Import a Python module named codeUtils using the alias cu. This module contains utility functions and methods that we'll use throughout our data mining project. Using an alias like cu makes it easier to reference the functions within the module in your subsequent code.

In [27]:
import codeUtils as cu
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.cluster import KMeans

Loading our four datasets

In [15]:
movies = cu.load_data('data/movies.csv')
ratings = cu.load_data('data/ratings.csv')
tags = cu.load_data('data/tags.csv')
links = cu.load_data('data/links.csv')

In [16]:
print("Total number of Movies: "+str(len(movies)))
print("Total number of Users: "+str(ratings.userId.nunique()))


Total number of Movies: 9742
Total number of Users: 610


## Managing the dataset

### Note: From README.txt
_Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970_

In our case of study this attribute has no importance, so we will get rid of it

In [17]:
cu.drop_columns(tags, ['timestamp'])
cu.drop_columns(ratings, ['timestamp'])

Merging data resulting in "merged_data" dataset contains consolidated information about movie ratings, movie details, and user-generated tags, which will be used for further analysis and processing.

In [18]:
merged_data = cu.merge_data(ratings, movies, 'movieId','inner')

merged_data = cu.merge_data(merged_data, tags, ['userId','movieId'],'left')

cu.drop_na(merged_data)
cu.drop_duplicate(merged_data)


The resulting "merged_data" dataset now includes the average rating information

In [19]:
avreage_rating = cu.calculate_average(merged_data,'movieId','rating')

merged_data = cu.merge_data(merged_data, avreage_rating, 'movieId','inner')


Binarization of attributes representing each genre, allowing for easier analysis.

In [20]:
new_merged_data = cu.transform_attribute_to_multiple(merged_data, 'genres', '|')


The columns being dropped are 'genres', 'title', and '(no genres listed)'. These columns are no longer needed after transforming the 'genres' attribute into binary attributes and are therefore dropped from the dataset, in order to the dataset contains only the relevant attributes.

In [21]:
cu.drop_columns(new_merged_data, ['genres', 'title', '(no genres listed)'])

The 'tag' attribute contains categorical values that are transformed into numerical labels.

In [22]:
cu.drop_columns(new_merged_data, ['tag'])



In [23]:
new_merged_data['total_genres'] = new_merged_data.iloc[:, 4:].sum(axis=1)
new_merged_data


Unnamed: 0,userId,movieId,rating,average_rating,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Drama,War,Sci-Fi,Western,Horror,Musical,Film-Noir,IMAX,Documentary,total_genres
0,336,1,4.0,3.833333,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,5
1,474,1,4.0,3.833333,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,5
2,567,1,3.5,3.833333,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,5
3,289,3,2.5,2.500000,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,2
4,289,3,2.5,2.500000,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3471,567,170945,3.5,3.500000,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,3
3472,567,176419,3.0,3.000000,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,4
3473,567,176419,3.0,3.000000,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,4
3474,567,176419,3.0,3.000000,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,4


In [24]:
new_merged_data.fillna(0, inplace=True)
genre_columns = new_merged_data.columns[4:-1].to_list()
numerical_features = ['rating', 'average_rating', 'total_genres']
scaler = StandardScaler()
new_merged_data[numerical_features] = scaler.fit_transform(new_merged_data[numerical_features])

In [25]:
def rule_based_rating(user_id, movie_id):
    user_data = new_merged_data[new_merged_data['userId'] == user_id]
    movie_data = new_merged_data[new_merged_data['movieId'] == movie_id]
    
    if user_data.empty or movie_data.empty:
        return None  
    
    
    user_genre_preferences = user_data[genre_columns].mean()
    movie_genres = movie_data[genre_columns].iloc[0]
    
    score = (user_genre_preferences * movie_genres).sum()
    average_user_rating = user_data['rating'].mean()
    
    rating = score + average_user_rating
    return rating

In [28]:
def clustering_based_rating(user_id, movie_id):
    user_features = new_merged_data[['userId'] + numerical_features + genre_columns].drop_duplicates()
    
    if user_features.empty:
        return None  
    
    
    kmeans = KMeans(n_clusters=10, random_state=42)
    kmeans.fit(user_features.drop(columns=['userId']))
    
    user_data = user_features[user_features['userId'] == user_id]
    if user_data.empty:
        return None  
    
    user_cluster = kmeans.predict(user_data.drop(columns=['userId']))
    cluster_center = kmeans.cluster_centers_[user_cluster]
    
    movie_features = new_merged_data[['movieId'] + numerical_features + genre_columns].drop_duplicates()
    if movie_features.empty:
        return None  
    
    movie_data = movie_features[movie_features['movieId'] == movie_id]
    if movie_data.empty:
        return None  
    
    movie_features['cluster_distance'] = pairwise_distances_argmin_min(movie_features.drop(columns=['movieId']), cluster_center)[0]
    
    closest_movie = movie_features.loc[movie_features['cluster_distance'].idxmin()]
    predicted_rating = closest_movie['rating']
    return predicted_rating

In [None]:
def combined_rating(user_id, movie_id):
    rule_rating = rule_based_rating(user_id, movie_id)
    clustering_rating = clustering_based_rating(user_id, movie_id)
    
    if rule_rating is None or clustering_rating is None:
        return None  
    
    combined_rating = (rule_rating + clustering_rating) / 2
    denormalized_rating = combined_rating * scaler.scale_[0] + scaler.mean_[0]
    return denormalized_rating

In [13]:
user_id = 336
movie_id = 1
predicted_rating = cu.combined_rating(user_id, movie_id, train_df, test_df,numerical_features=numerical_features, genre_columns=genre_columns)
print(f'Predicted rating for user {user_id} and movie {movie_id} is {predicted_rating}')

  super()._check_params_vs_input(X, default_n_init=10)
found 0 physical cores < 1
  File "c:\Users\pc\AppData\Local\Programs\Python\Python311\Lib\site-packages\joblib\externals\loky\backend\context.py", line 245, in _count_physical_cores
    raise ValueError(


AttributeError: 'StandardScaler' object has no attribute 'scale_'

## Normalization and Standarization

### 1. Normalization

By applying Min-max

In [10]:
normalized_data = cu.normalize_data(new_merged_data, ['rating', 'average_rating', 'tag'])

normalized_data

Unnamed: 0,userId,movieId,rating,tag,average_rating,Adventure,Animation,Children,Comedy,Fantasy,...,Action,Drama,War,Sci-Fi,Western,Horror,Musical,Film-Noir,IMAX,Documentary
0,336,1,0.777778,0.000000,0.740741,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,474,1,0.777778,0.000000,0.740741,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,567,1,0.666667,0.000649,0.740741,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,289,3,0.444444,0.001297,0.444444,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,289,3,0.444444,0.001946,0.444444,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3471,567,170945,0.666667,0.998054,0.666667,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3472,567,176419,0.555556,0.998703,0.555556,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
3473,567,176419,0.555556,0.999351,0.555556,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
3474,567,176419,0.555556,1.000000,0.555556,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0


### 2. Standarization

By applying StandardScaler

In [11]:
standrized_data = cu.standardize_data(new_merged_data, ['rating', 'average_rating', 'tag'])

standrized_data

Unnamed: 0,userId,movieId,rating,tag,average_rating,Adventure,Animation,Children,Comedy,Fantasy,...,Action,Drama,War,Sci-Fi,Western,Horror,Musical,Film-Noir,IMAX,Documentary
0,336,1,-0.019642,-1.339026,-0.224722,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,474,1,-0.019642,-1.339026,-0.224722,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,567,1,-0.603208,-1.336675,-0.224722,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,289,3,-1.770339,-1.334324,-1.857613,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,289,3,-1.770339,-1.331973,-1.857613,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3471,567,170945,-0.603208,2.279183,-0.632945,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3472,567,176419,-1.186773,2.281534,-1.245279,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
3473,567,176419,-1.186773,2.283885,-1.245279,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
3474,567,176419,-1.186773,2.286236,-1.245279,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0


## Split the data into training and test datasets

In [12]:
nrm_train, nrm_test = cu.split_data(normalized_data)

nrm_train.shape, nrm_test.shape

((2780, 24), (696, 24))

In [13]:
std_train, std_test = cu.split_data(standrized_data)

std_train.shape, std_test.shape

((2780, 24), (696, 24))