<a href="https://colab.research.google.com/github/lucyzandile/unsupervised-predict-streamlit-template/blob/master/coclustering_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

C-clustering notebook

In [45]:
pip install scikit-surprise



pip install scikit-surprise

In [46]:
# data analysis libraries
import pandas as pd
import numpy as np

# visualisation libraries
from matplotlib import pyplot as plt
import seaborn as sns
from numpy.random import RandomState


# Notebook styling
%matplotlib inline
sns.set()


# ML Models
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import CoClustering
from surprise.accuracy import rmse
from surprise import accuracy

# ML Pre processing
from surprise.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hyperparameter tuning
from surprise.model_selection import GridSearchCV

# High performance hyperparameter tuning
#from tune_sklearn import TuneSearchCV
#import warnings
#warnings.filterwarnings("ignore")

In [47]:
train = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/train.csv")
test = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/test.csv")
movies = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/movies.csv")

In [48]:
df = train.copy()

In [49]:
#create a copy of the train data
df_train = train.copy()

#display top 5 records
df_train.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,5163,57669,4.0,1518349992
1,106343,5,4.5,1206238739
2,146790,5459,5.0,1076215539
3,106362,32296,2.0,1423042565
4,9041,366,3.0,833375837


In [50]:
# Find the length of the unique use
len(df_train['userId'].unique()), len(df_train['movieId'].unique())

(162541, 48213)

In [51]:
# Merge
df_merge1 = train.merge(movies, on = 'movieId')
df_merge1.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,5163,57669,4.0,1518349992,In Bruges (2008),Comedy|Crime|Drama|Thriller
1,87388,57669,3.5,1237455297,In Bruges (2008),Comedy|Crime|Drama|Thriller
2,137050,57669,4.0,1425631854,In Bruges (2008),Comedy|Crime|Drama|Thriller
3,120490,57669,4.5,1408228517,In Bruges (2008),Comedy|Crime|Drama|Thriller
4,50616,57669,4.5,1446941640,In Bruges (2008),Comedy|Crime|Drama|Thriller


In [52]:
# Loading as Surprise dataframe 
reader = Reader()
data = Dataset.load_from_df(train[['userId', 'movieId', 'rating']], reader)

In [53]:
# Data split 85/15
trainset, testset = train_test_split(data, test_size=0.15)

In [20]:
co_clust = CoClustering()

In [54]:
# Fitting our trainset
co_clust.fit(trainset)

# Using the 15% testset to make predictions
predictions = co_clust.test(testset) 
predictions

test = pd.DataFrame(predictions)

In [55]:
# View the head
test.head()

Unnamed: 0,uid,iid,r_ui,est,details
0,152992,102800,4.5,3.393764,{'was_impossible': False}
1,21139,91529,4.0,2.011547,{'was_impossible': False}
2,24004,3977,3.0,3.591408,{'was_impossible': False}
3,69754,2403,5.0,3.443428,{'was_impossible': False}
4,26518,81562,3.5,3.65595,{'was_impossible': False}


In [56]:
# We are trying to predict ratings for every userId / movieId pair, we implement the below list comprehension to achieve this.
ratings_predictions=[co_clust.predict(row.uid, row.iid) for _,row in test.iterrows()]
ratings_predictions

[Prediction(uid=152992, iid=102800, r_ui=None, est=3.3937636313167916, details={'was_impossible': False}),
 Prediction(uid=21139, iid=91529, r_ui=None, est=2.0115474081760247, details={'was_impossible': False}),
 Prediction(uid=24004, iid=3977, r_ui=None, est=3.5914080886174276, details={'was_impossible': False}),
 Prediction(uid=69754, iid=2403, r_ui=None, est=3.4434281187452145, details={'was_impossible': False}),
 Prediction(uid=26518, iid=81562, r_ui=None, est=3.655949957197733, details={'was_impossible': False}),
 Prediction(uid=71748, iid=1028, r_ui=None, est=3.526917078229193, details={'was_impossible': False}),
 Prediction(uid=77718, iid=55290, r_ui=None, est=3.5519608382381254, details={'was_impossible': False}),
 Prediction(uid=23565, iid=1544, r_ui=None, est=3.168230294073469, details={'was_impossible': False}),
 Prediction(uid=17436, iid=52722, r_ui=None, est=2.5312808302924426, details={'was_impossible': False}),
 Prediction(uid=99587, iid=1923, r_ui=None, est=3.7216477961

In [57]:
# Converting our prediction into a familiar format-Dataframe
df_pred=pd.DataFrame(ratings_predictions)
df_pred

Unnamed: 0,uid,iid,r_ui,est,details
0,152992,102800,,3.393764,{'was_impossible': False}
1,21139,91529,,2.011547,{'was_impossible': False}
2,24004,3977,,3.591408,{'was_impossible': False}
3,69754,2403,,3.443428,{'was_impossible': False}
4,26518,81562,,3.655950,{'was_impossible': False}
...,...,...,...,...,...
1500001,5692,3755,,1.976442,{'was_impossible': False}
1500002,137828,1024,,2.704224,{'was_impossible': False}
1500003,15224,1717,,2.418765,{'was_impossible': False}
1500004,115608,1734,,4.585440,{'was_impossible': False}


In [58]:
# Renaming our predictions to original names
df_pred=df_pred.rename(columns={'uid':'userId', 'iid':'movieId','est':'rating'})
df_pred.drop(['r_ui','details'],axis=1,inplace=True)

In [59]:
# Snippet of our ratings
df_pred.head()

Unnamed: 0,userId,movieId,rating
0,152992,102800,3.393764
1,21139,91529,2.011547
2,24004,3977,3.591408
3,69754,2403,3.443428
4,26518,81562,3.65595


In [60]:
# Concatenating userId/movieId into a single Id column.(code has to be run twice to get desired outcome)
df_pred['Id']=df_pred.apply(lambda x:'%s_%s' % (x['userId'],x['movieId']),axis=1)
df_pred['Id']=df_pred.apply(lambda x:'%s_%s' % (x['userId'],x['movieId']),axis=1)


In [61]:
# drop the two features from the dataset userId and movieId
df_pred.drop(['userId', 'movieId'], inplace=True, axis= 1)

In [63]:
df_pred.to_csv("coClustering_model_base.csv", index=False)

co clustering produces a score of 1.18