In [1]:
import pandas as pd
import numpy as np
import math

### [0]

In [2]:
# Location of the IMDb CSV, served as a raw file from GitHub.
url_imdb = (
    'https://raw.githubusercontent.com/'
    'nullpitch-dev/hj_public/master/imdb.csv'
)
# Download and parse the CSV into a DataFrame (requires network access).
data_imdb = pd.read_csv(filepath_or_buffer=url_imdb)

In [3]:
# Working frame for every later section: keep only rows with no missing value
# in any column.
base = data_imdb.dropna(axis=0, how='any')

### [1]

In [4]:
# Split the titles into two groups on the `SciFi` indicator column:
# rows where it equals 1, and all remaining rows.
d1_scifi = base.loc[base['SciFi'].eq(1)]
d1_nonscifi = base.loc[base['SciFi'].ne(1)]

In [5]:
from scipy.stats import ttest_ind

# Two-sample t-test on imdbRating between the SciFi and non-SciFi groups.
# equal_var=False requests the unequal-variance (Welch) form of the test.
t_val, p_val = ttest_ind(
    a=d1_scifi['imdbRating'],
    b=d1_nonscifi['imdbRating'],
    equal_var=False,
)

# Report |t| truncated (not rounded) to three decimal places.
print(f"{math.floor(abs(t_val) * 1000) / 1000}")

9.792


### [2]

In [6]:
# Number of titles per year, restricted to years with at least 50 titles;
# `year` is turned back into an ordinary column by reset_index().
d2_cnt = (
    base.groupby('year')
        .agg({'tid': 'count'})
        .loc[lambda counts: counts['tid'] >= 50]
        .reset_index()
)

# Titles released in one of those well-populated years.
d2_mv = base.loc[base['year'].isin(d2_cnt['year'])]

In [7]:
# Per-year profile: the mean of each listed numeric column, one row per year.
d2_year = d2_mv.groupby('year').agg(
    dict.fromkeys(
        ['duration', 'imdbRating', 'nrOfGenre',
         'nrOfNewsArticles', 'nrOfUserReviews', 'ratingCount'],
        'mean',
    )
)

In [8]:
# Min-max scale every column to [0, 1]: (value - column min) / (column range).
# The column-wise min/max Series align with d2_year's columns on broadcast.
d2_norm = (d2_year - d2_year.min()) / (d2_year.max() - d2_year.min())

In [9]:
from sklearn.cluster import KMeans

# Cluster the normalized yearly profiles into 7 groups.
# random_state pins the initialisation; n_init=1234 repeats the algorithm
# with that many different centroid seeds and keeps the best run.
cluster = KMeans(n_clusters=7, n_init=1234, random_state=1234)
cluster.fit(d2_norm)

In [10]:
# Cluster label assigned to the row whose index (year) is 1977.
label = cluster.labels_[np.flatnonzero(d2_norm.index == 1977)[0]]
# Positions of every row that received that same label, as plain Python ints.
idx_samelabel = np.flatnonzero(cluster.labels_ == label).tolist()
# Translate those row positions back into year values.
yr_samelabel = d2_norm.index[idx_samelabel]

In [11]:
# Count the titles released in any year that shares 1977's cluster.
d2_result = base.loc[base['year'].isin(yr_samelabel), 'tid'].count()
print(f"{d2_result}")

1047


### [3]

In [12]:
# Predictor columns used by the logistic-regression models.
X_var = [
    'ratingCount',
    'duration',
    'nrOfWins',
    'nrOfNewsArticles',
    'nrOfUserReviews',
    'nrOfGenre',
]

# Binary target: 'Y' when imdbRating is strictly above 9, otherwise 'N'.
# The new `reco` column is appended as the last column of `base`.
base = base.assign(
    reco=base['imdbRating'].map(lambda rating: 'Y' if rating > 9 else 'N')
)

# Separate subsets for TV episodes and for movies, keyed on the `type` column.
episode = base.loc[base['type'].eq('video.episode')]
movie = base.loc[base['type'].eq('video.movie')]

In [13]:
from sklearn.linear_model import LogisticRegression

# Fit one identically configured classifier per subset (episodes first, then
# movies). C is scikit-learn's inverse regularization strength, so a value
# this large leaves the L2 penalty with very little influence on the fit.
model_episode, model_movie = (
    LogisticRegression(
        penalty='l2',
        C=100000,
        random_state=1234,
        solver='newton-cg',
    ).fit(subset[X_var], subset['reco'])
    for subset in (episode, movie)
)



In [14]:
# Position of the largest (signed) coefficient in each fitted model.
# coef_[0] is the single coefficient row of the binary classifier; np.argmax
# returns the first index at which the maximum occurs.
idx_episode = np.argmax(model_episode.coef_[0])
idx_movie = np.argmax(model_movie.coef_[0])

In [15]:
# Map each winning coefficient position back to its feature name.
# Kept as one-element lists because the reporting cell reads element [0].
val_episode = [X_var[idx_episode]]
val_movie = [X_var[idx_movie]]

In [16]:
# Report the first selected feature name for the episode model, then for the
# movie model, separated by a comma.
print(f"{val_episode[0]}, {val_movie[0]}")

nrOfUserReviews, ratingCount


### [4]

In [17]:
# Episodes tagged with more than one genre, reduced to a positional column
# slice (column 10 up to, but excluding, the last column).
# NOTE(review): presumably columns 10.. are the one-hot genre indicators and
# the excluded last column is the `reco` label appended earlier — confirm
# against the CSV schema, since this slice is purely positional.
d4 = (
    base.loc[base['type'] == 'video.episode']
        .loc[lambda frame: frame['nrOfGenre'] > 1]
        .iloc[:, 10:-1]
)

In [18]:
from mlxtend.frequent_patterns import apriori, association_rules

# Frequent itemsets with support of at least 1%, reported by column name
# rather than by column position.
frequent_items = apriori(
    d4,
    min_support=0.01,
    use_colnames=True,
)

# Association rules derived from those itemsets, kept when their confidence
# reaches the 0.01 threshold.
asso_rule = association_rules(
    frequent_items,
    metric='confidence',
    min_threshold=0.01,
)

In [19]:
# Record how many items sit on each side of every rule, then keep only the
# one-to-one rules (a single antecedent implying a single consequent).
asso_rule = (
    asso_rule
    .assign(
        ant_len=lambda rules: rules['antecedents'].map(len),
        con_len=lambda rules: rules['consequents'].map(len),
    )
    .loc[lambda rules: (rules['ant_len'] == 1) & (rules['con_len'] == 1)]
)

In [20]:
# Among the rules whose antecedent is exactly {'Animation'}, take the one with
# the lowest lift and extract an item from its consequent set.
# The consequent is selected by column name rather than by column position so
# the lookup does not depend on the column order of the rules frame.
genre = next(iter(
    asso_rule.loc[asso_rule['antecedents'] == {'Animation'}]
    .sort_values(by='lift', ascending=True)['consequents']
    .iloc[0]
))

In [21]:
# Confidence of the first rule matching {'Animation'} -> {genre}.
conf = asso_rule.loc[
    (asso_rule['antecedents'] == {'Animation'})
    & (asso_rule['consequents'] == {genre}),
    'confidence',
].iloc[0]

In [22]:
# Report the selected genre together with its rule confidence, formatted to
# three decimal places.
print(f"{genre}, {conf:.3f}")

Drama, 0.062
