<a href="https://colab.research.google.com/github/nullpitch-dev/DS_L1_Notebooks/blob/master/DS_L1_EX_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd

# Load the IMDb dataset from a CSV file hosted on GitHub.
url = 'https://raw.githubusercontent.com/nullpitch-dev/hj_public/master/imdb.csv'
df = pd.read_csv(url)

# Keep only complete records: drop every row that has at least one null value.
df_clean = df.dropna(axis=0, how='any')


In [0]:
# [1]
from scipy.stats import ttest_ind

# Split the cleaned records on the 0/1 'SciFi' flag.
scifi_flag = df_clean['SciFi']
df_scifi = df_clean.loc[scifi_flag == 1]
df_nonscifi = df_clean.loc[scifi_flag == 0]

# Two-sample t-test on the ratings; equal_var=False selects Welch's variant,
# which does not assume the two groups share the same variance.
ratings_scifi = df_scifi['imdbRating']
ratings_nonscifi = df_nonscifi['imdbRating']
stat, p = ttest_ind(ratings_scifi, ratings_nonscifi, equal_var=False)

print(f'T-value = {stat}, p-value = {p}')

T-value = -9.792416636544313, p-value = 1.0987311961490895e-21


In [0]:
# [2]
from sklearn.cluster import KMeans

# Features averaged per year and used as the clustering inputs.
feature_cols = ['duration', 'imdbRating', 'nrOfGenre', 'nrOfNewsArticles',
                'nrOfUserReviews', 'ratingCount']

# Years that have at least 50 titles (rows) in the cleaned data.
yr_50_cnt = df_clean.groupby('year').count()
yr_50_cnt = yr_50_cnt[yr_50_cnt['tid'] >= 50]

# Keep only the titles released in those years. One boolean mask replaces the
# former row-by-row DataFrame.append loop, which was quadratic and is no
# longer available in pandas >= 2.0.
mv_50 = df_clean[df_clean['year'].isin(yr_50_cnt.index)]

# Per-year feature means. The feature columns are selected *before*
# aggregating so that text columns (e.g. 'type') are never averaged, which
# raises a TypeError in pandas >= 2.0.
yr_mean = mv_50.groupby('year')[feature_cols].mean()

# Min-max scale every feature to the [0, 1] range.
yr_norm = (yr_mean - yr_mean.min()) / (yr_mean.max() - yr_mean.min())

# NOTE(review): n_init=1234 runs 1234 separate initialisations, which is slow;
# confirm this is intended and not an accidental copy of the seed value.
kmeans = KMeans(n_clusters=7, random_state=1234, n_init=1234).fit(yr_norm)
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=yr_norm.columns)

# Predict with a one-row DataFrame (note the double brackets) so the input
# carries the same feature names the model was fitted with.
clu_1977 = kmeans.predict(yr_norm.loc[[1977]])[0]
print(f'year 1977 is in cluster #{clu_1977 + 1}\n')

# labels_ follows the row order of yr_norm, so a boolean mask on its index
# yields the years that share 1977's cluster; then count their titles.
years_same_clus = yr_norm.index[kmeans.labels_ == clu_1977]
mv_count = mv_50['year'].isin(years_same_clus).sum()

print(f'Total movies in cluster #{clu_1977 + 1} is {mv_count}')


year 1977 is in cluster #5

Total movies in cluster #5 is 1047


In [0]:
# [3]
#from statsmodels.formula.api import ols
from sklearn.linear_model import LogisticRegression

# Predictors used in every per-type model.
rating_features = ['ratingCount', 'duration', 'nrOfWins', 'nrOfNewsArticles',
                   'nrOfUserReviews', 'nrOfGenre']


def top_odds_ratio_feature(records):
    """Return the predictor with the largest odds ratio for a rating above 9.

    Fits a logistic regression of the binary target ``imdbRating > 9`` on the
    columns in ``rating_features`` and ranks them by ``exp(coefficient)``.

    Parameters
    ----------
    records : pandas.DataFrame
        Rows to fit on; must contain ``imdbRating`` and every column listed
        in ``rating_features``.

    Returns
    -------
    str
        Name of the predictor whose exponentiated coefficient is the highest.
    """
    x = records[rating_features]
    # Binary target: 1 when the rating is strictly above 9, otherwise 0.
    y = (records['imdbRating'] > 9).astype(int)

    model = LogisticRegression(C=100000, random_state=1234, penalty='l2',
                               solver='newton-cg')
    model.fit(x, y)

    # exp(coef) turns each log-odds coefficient into an odds ratio.
    odds_ratios = pd.DataFrame(data=np.exp(model.coef_[0]).tolist(),
                               index=rating_features, columns=['coef'])
    return odds_ratios.sort_values('coef', ascending=False).index[0]


# One subset per title type.
episode = df_clean[df_clean['type'] == 'video.episode']
movie = df_clean[df_clean['type'] == 'video.movie']
tv = df_clean[df_clean['type'] == 'video.tv']

# The same model is fitted independently on each subset.
ans_episode = top_odds_ratio_feature(episode)
ans_movie = top_odds_ratio_feature(movie)
ans_tv = top_odds_ratio_feature(tv)

print(f'video.episode : {ans_episode}')
print(f'video.movie : {ans_movie}')
print(f'video.tv : {ans_tv}')




video.episode : nrOfUserReviews
video.movie : ratingCount
video.tv : nrOfUserReviews




In [0]:
# [4]
from mlxtend.frequent_patterns import apriori, association_rules

# Data pre-processing: episodes whose genre count is not exactly 1.
is_episode = df_clean['type'] == 'video.episode'
not_single_genre = df_clean['nrOfGenre'] != 1
# NOTE(review): assumes the columns from position 10 onward are the 0/1 genre
# indicator columns -- confirm against the CSV layout.
# Casting to bool gives apriori the dtype it is designed for (non-bool input
# is deprecated in recent mlxtend releases).
episode = df_clean.loc[is_episode & not_single_genre].iloc[:, 10:].astype(bool)

freq_items = apriori(episode, min_support=0.01, use_colnames=True)
rules = association_rules(freq_items, metric='confidence', min_threshold=0.01)

# Keep only one-to-one rules (a single antecedent and a single consequent).
# The former test `len(a) == 1 & len(c) == 1` was parsed as
# `len(a) == (1 & len(c)) == 1` because `&` binds tighter than `==`, so rules
# with any odd number of consequents slipped through.
one_to_one = ((rules['antecedents'].map(len) == 1)
              & (rules['consequents'].map(len) == 1))
rules = rules[one_to_one]

# Among the rules whose antecedent is 'Animation', take the one with the
# lowest confidence (ascending sort, first row).
animation_rules = rules[rules['antecedents'] == {'Animation'}]
weakest_rule = animation_rules.sort_values('confidence').iloc[0]

ans_genre = list(weakest_rule['consequents'])[0]
ans_confidence = round(weakest_rule['confidence'], 3)

print(ans_genre)
print(ans_confidence)

Drama
0.062
