<a href="https://colab.research.google.com/github/nullpitch-dev/DS_L1_Notebooks/blob/master/DS_L1_EX_10_2nd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd

# Location of the raw IMDb exercise CSV on GitHub (split only for line length).
url = ('https://raw.githubusercontent.com/'
       'nullpitch-dev/hj_public/master/imdb.csv')

# Download the file and parse it into the DataFrame used by every cell below.
data = pd.read_csv(url)

In [0]:
# [0] Discard every row that has at least one missing value.
# (axis=0 / how='any' are the defaults, spelled out for the reader.)
data = data.dropna(axis=0, how='any')

In [53]:
# [1] ttest_ind

from scipy import stats
import math

# Partition the titles on the SciFi genre flag.
scifi_1 = data.loc[data['SciFi'] == 1]
scifi_0 = data.loc[data['SciFi'] == 0]

# Two-sample t-test on the IMDb ratings of both groups; equal_var=False
# selects the unequal-variance (Welch) form of the test.
t_val, p_val = stats.ttest_ind(scifi_1['imdbRating'], scifi_0['imdbRating'],
                               equal_var=False)

# Report |t| truncated (floored, not rounded) to three decimals.
t_truncated = math.floor(abs(t_val) * 1000) / 1000
print(f'Answer [1] : {t_truncated}')

Answer [1] : 9.792


In [54]:
# [2] KMeans, list comprehension, enumerate

from sklearn.cluster import KMeans
import numpy as np

# Keep only the years that contain at least 50 titles (the filter is inclusive).
years = data.groupby('year').agg({'tid': 'count'})
years = years[years.tid >= 50]

# Restrict the data set to titles released in those years.
data2 = data[data.year.isin(years.index)]

# Per-year mean of each numeric feature. The columns are passed as a list:
# indexing a GroupBy with a bare tuple of names was deprecated in pandas 1.0
# and no longer works in pandas 2.x.
feature_cols = ['duration', 'imdbRating', 'nrOfGenre', 'nrOfNewsArticles',
                'nrOfUserReviews', 'ratingCount']
mean = data2.groupby('year')[feature_cols].mean()

# Min-max scale every feature column to [0, 1] across the years.
norm = mean.apply(lambda x: (x - x.min()) / (x.max() - x.min()))

# Cluster the years. n_init=1234 reruns k-means from 1234 different
# initialisations and keeps the best run, so this line dominates the runtime.
cluster = KMeans(n_clusters=7, n_init=1234, random_state=1234).fit(norm)

# Cluster label assigned to 1977. If 1977 was filtered out above, `label` is
# empty and the `label[0]` lookup below raises IndexError.
label = cluster.labels_[norm.index == 1977]

# Every year that landed in the same cluster as 1977.
same_label_years = norm.index[cluster.labels_ == label[0]]

# Number of titles released in those years.
count = data2[data2.year.isin(same_label_years)].tid.count()

print(f'Answer [2] : {count}')

Answer [2] : 1047


In [55]:
# [3] LogisticRegression, Odds ratio, list comprehension, enumerate

from sklearn.linear_model import LogisticRegression
import numpy as np

# Explanatory variables, selected by column position.
# NOTE(review): this depends on the CSV column order — confirm that positions
# 2-3 and 6-9 are the intended numeric features.
X_cols = list(data.columns[2:4]) + list(data.columns[6:10])

# Binary target: 1 when the IMDb rating is strictly above 9, otherwise 0.
data = data.assign(reco=(data.imdbRating > 9).astype(int))


def _split_xy(frame, video_type, feature_cols):
    """Return (X, y) for the rows of `frame` whose `type` equals `video_type`."""
    subset = frame[frame.type == video_type]
    return subset[feature_cols], subset.reco


def _fit_logit(X, y):
    """Fit a logistic regression with the settings shared by all three models."""
    model = LogisticRegression(C=100000, random_state=1234, penalty='l2',
                               solver='newton-cg')
    return model.fit(X, y)


# One training set per title type.
movie_X, movie_y = _split_xy(data, 'video.movie', X_cols)
episode_X, episode_y = _split_xy(data, 'video.episode', X_cols)
tv_X, tv_y = _split_xy(data, 'video.tv', X_cols)

# One model per title type (the tv model is fitted but not used in the answer).
lr_movie = _fit_logit(movie_X, movie_y)
lr_episode = _fit_logit(episode_X, episode_y)
lr_tv = _fit_logit(tv_X, tv_y)

# The odds ratio is exp(coefficient) and exp is monotonic, so the variable with
# the largest odds ratio is the one with the largest coefficient. np.argmax
# returns the first maximum and avoids comparing floating-point exp() results
# for exact equality.
var_episode = X_cols[int(np.argmax(lr_episode.coef_[0]))]
var_movie = X_cols[int(np.argmax(lr_movie.coef_[0]))]

print(f'Answer [3] : {var_episode}, {var_movie}')



Answer [3] : nrOfUserReviews, ratingCount




In [56]:
# [4] apriori, association_rules

from mlxtend.frequent_patterns import apriori, association_rules

# Keep video.episode rows and exclude titles that have exactly one genre.
data4 = data[data.type == 'video.episode']
data4 = data4[data4.nrOfGenre != 1]

# Genre indicator columns: everything from position 10 onwards except the
# derived `reco` label. Excluding it by name (instead of slicing off the last
# column) gives the same columns whether or not the cell that adds `reco` ran.
# NOTE(review): assumes all columns from position 10 on are 0/1 genre flags.
cols = [c for c in data4.columns[10:] if c != 'reco']

# The frame is already one-hot encoded, so no TransactionEncoder is needed;
# cast to bool because that is the dtype mlxtend expects for the item matrix.
frequent_items = apriori(data4[cols].astype(bool), min_support=0.01,
                         use_colnames=True)
asso = association_rules(frequent_items, metric='confidence',
                         min_threshold=0.01)

# Keep only rules with a single antecedent and a single consequent.
asso = asso.assign(len_ant=asso.antecedents.apply(len),
                   len_con=asso.consequents.apply(len))
asso = asso[(asso.len_ant == 1) & (asso.len_con == 1)]

# Rules whose antecedent is Animation, lowest lift first.
asso_ani = asso[asso.antecedents == {'Animation'}].sort_values(by='lift',
                                                             ascending=True)

# The first row is the lowest-lift rule; read its consequent and confidence
# from that same row (raises IndexError when no Animation rule exists).
lowest = asso_ani.iloc[0]
lowlift = list(lowest.consequents)[0]
confidence = lowest.confidence

print(f'Answer [4] : {lowlift}, {confidence:.3f}')

Answer [4] : Drama, 0.062
