In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default styling to the matplotlib figures drawn below.
sns.set()

# Let pandas display up to 200 rows and an unlimited number of columns.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', None)

In [None]:
# Load the Netflix shows spreadsheet and preview its first rows.
target = pd.read_excel('assets/NetflixShows.xlsx')
target.head()

In [None]:
# Distinct (rating, ratingDescription) pairs found in the spreadsheet.
rating_description = target[['rating', 'ratingDescription']].drop_duplicates()

In [None]:
# Read the tab-separated Netflix/IMDb table.
df = pd.read_csv('assets/netflix_movies_with_imdb.tsv', sep='\t')

# One rating per row: rating_small where present, otherwise rating_big.
df['rating'] = df['rating_small'].combine_first(df['rating_big'])

# Attach the description of each rating while keeping every row of df.
# NOTE(review): if rating_description lists a rating more than once, this left
# join repeats the matching df rows - confirm the lookup is unique per rating.
df = df.merge(rating_description, on='rating', how='left')
df.head()

In [None]:
# (rows, columns) of the merged table.
df.shape

In [None]:
# Distinct values of the rating_small column.
df.rating_small.unique()

In [None]:
# Ratings that occur in rating_big but never in rating_small.
set(df.rating_big.unique()) - set(df.rating_small.unique())

In [None]:
# Distinct values of the rating_big column.
df.rating_big.unique()

In [None]:
# Bar chart of averageRating by rating, coloured by ratingDescription.
fig, ax = plt.subplots(figsize=(8, 5))

sns.barplot(
    df,
    x='rating',
    y='averageRating',
    hue='ratingDescription',
    palette="coolwarm",
    dodge=False,
    ax=ax,
)
ax.legend(loc='upper center', ncols=6)

# Write each bar's value (one decimal place) in the middle of the bar.
for container in ax.containers:
    ax.bar_label(container, fmt='%.1f', label_type='center')

fig.tight_layout()
# fig.savefig('graph.png', dpi=300, facecolor='w')

In [None]:
# Histogram of the averageRating column.
df.averageRating.hist()

In [None]:
# Ratings listed per audience group, then inverted into a rating -> group lookup.
# NOTE(review): 'NR' is placed with adults while 'UR' is placed with kids -
# confirm that treating these two differently is intended.
group = {
    rating: audience
    for audience, ratings in {
        'kids': ['TV-PG', 'TV-Y', 'TV-G', 'PG', 'TV-Y7', 'G', 'TV-Y7-FV', 'UR'],
        'teens': ['TV-14', 'PG-13'],
        'adults': ['TV-MA', 'NR', 'R', 'NC-17'],
    }.items()
    for rating in ratings
}

# Numeric code per group; later cells sort by it to order the groups.
rating_group_num = {'kids':0, 'teens':14, 'adults':18}

# Ratings (or groups) missing from the lookups become NaN.
df['rating_group'] = df.rating.map(group)
df['rating_group_num'] = df.rating_group.map(rating_group_num)
df.rating_group.hist()

In [None]:
# Density of averageRating, one curve per rating group.
sns.kdeplot(data=df, x='averageRating', hue='rating_group')

In [None]:
# Rows flagged as belonging to the small dataset.
# Note: this rebinds `target`, which earlier held the frame loaded with read_excel.
is_small = df['dataset_small'] == 'small'
target = df.loc[is_small]
print(target.shape)

In [None]:
# Rows flagged as belonging to the big dataset.
is_big = df['dataset_big'] == 'big'
full = df.loc[is_big]
print(full.shape)

In [None]:
# Per (release_year, rating_group) counts of the remaining columns of `target`.
# `ratings` is not referenced again in the visible part of this notebook.
ratings = target.groupby(['release_year', 'rating_group']).count()

In [None]:
# Filled (normalised) density of release_year per rating group for `target`.
# Rows are sorted by rating_group_num first; the density is clipped to 1940-2017.
sns.kdeplot(
    data=target.sort_values('rating_group_num'),
    x='release_year',
    hue='rating_group',
    multiple='fill',
    clip=[1940, 2017],
)

In [None]:
# Filled (normalised) density of release_year per rating group for `full`.
# Rows are sorted by rating_group_num first; the density is clipped to 1940-2017.
sns.kdeplot(
    data=full.sort_values('rating_group_num'),
    x='release_year',
    hue='rating_group',
    multiple='fill',
    clip=[1940, 2017],
)

In [None]:
# Side-by-side comparison of the rating-group mix over release years.
fig, ax = plt.subplots(1, 2, figsize=(10,5))
labels = ['kids', 'teens', 'adults']

# Left panel uses `target`, right panel uses `full`; both share the same settings.
for panel_ax, frame in zip(ax, (target, full)):
    sns.kdeplot(
        data=frame.sort_values('rating_group_num'),
        x='release_year',
        hue='rating_group',
        multiple='fill',
        clip=[1940, 2017],
        ax=panel_ax,
    )

ax[0].set_title('1000 Shows Dataset')
ax[1].set_title('8000 Shows Dataset')

In [None]:
def get_pivot_table_scaled(df):
    """Return each rating group's percentage share of titles per release year.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``title``, ``release_year`` and
        ``rating_group``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``release_year``. Holds one float column per rating group
        with that group's share of the year's counted titles in percent, plus
        a ``total`` column with the absolute number of titles counted for the
        year.
    """
    pivot = pd.pivot_table(df, values='title', index='release_year',
                           columns='rating_group', aggfunc='count')
    # Select the group columns explicitly instead of slicing by label
    # ('adults':'teens'), which only works while every group label sorts
    # between those two names and 'total' sorts after them.
    group_columns = list(pivot.columns)
    pivot = pivot.fillna(0).astype(float)
    pivot['total'] = pivot[group_columns].sum(axis=1)
    # Row-wise division turns each group's count into a percentage of the year total.
    pivot[group_columns] = pivot[group_columns].div(pivot['total'], axis=0) * 100
    return pivot

# Scaled pivot tables for the small (`target`) and big (`full`) subsets.
pivot_target, pivot_full = (get_pivot_table_scaled(frame) for frame in (target, full))
pivot_full.tail()

In [None]:
# Stacked adults / teens / kids columns of both pivot tables over all release years.
fig, ax = plt.subplots(1, 2, figsize=(10,5))
labels = ['adults', 'teens', 'kids']

panels = [(ax[0], pivot_target, '1000 Shows Dataset'),
          (ax[1], pivot_full, '8000 Shows Dataset')]
for panel_ax, shares, panel_title in panels:
    panel_ax.set_title(panel_title)
    # Columns are stacked in the order given by `labels`, so legend entries line up.
    panel_ax.stackplot(shares.index,
                       *(shares[group_name] for group_name in labels),
                       labels=labels)
    # panel_ax.set_xlim([2007, 2017])

ax[0].legend()

ax[1].legend()

In [None]:
# Same stacked view as the previous figure, with the x-axis limited to 2007-2017.
fig, ax = plt.subplots(1, 2, figsize=(10,5))
labels = ['adults', 'teens', 'kids']

panels = [(ax[0], pivot_target, '1000 Shows Dataset'),
          (ax[1], pivot_full, '8000 Shows Dataset')]
for panel_ax, shares, panel_title in panels:
    panel_ax.set_title(panel_title)
    # Columns are stacked in the order given by `labels`, so legend entries line up.
    panel_ax.stackplot(shares.index,
                       *(shares[group_name] for group_name in labels),
                       labels=labels)
    panel_ax.set_xlim([2007, 2017])

ax[0].legend()

ax[1].legend()

# Views

In [None]:
# Load the engagement report, print its (rows, columns) shape and show the column summary.
views = pd.read_csv('assets/netflix-engagement-report.csv')
print(views.shape)
views.info()

In [None]:
# Normalise the column names: lower-case, spaces -> underscores, '?' removed.
views = views.rename(columns={x:x.lower().replace(' ', '_').replace('?', '') for x in views.columns})
views = views.drop(columns=['unnamed:_6'])

# A release year is derived from release_date below, so rows without one are dropped.
views = views.dropna(subset=['release_date'])

# Remove the thousands separators and express both counters in millions.
# Columns are assigned with brackets: attribute-style assignment only works for
# columns that already exist and is easy to confuse with DataFrame attributes.
views['views'] = views['views'].str.replace(',', '', regex=False).astype(int) / 1000000
views['hours_viewed'] = views['hours_viewed'].str.replace(',', '', regex=False).astype(int) / 1000000

# The year is the part of release_date before the first '-'.
views['release_year'] = views['release_date'].str.split('-').str[0].astype(int)
# views = views.query('release_year < 2018')

# Keep the unmodified title, then split it on ':' into title and season.
views['full_title'] = views['title'].copy()
title_parts = views['full_title'].str.split(':')
views['title'] = title_parts.str[0].str.strip()
# Second ':'-separated segment, or '' when the title has no ':'. Stripped so the
# space that follows the colon does not end up at the start of the season text.
views['season'] = title_parts.str[1].fillna('').str.strip()

print(views.shape)
views.head()

In [None]:
# Histogram of the release_year column of the engagement report.
views.release_year.hist()

In [None]:
# Histogram of hours_viewed using 100 bins.
views.hours_viewed.hist(bins=100)

In [None]:
# Seaborn pair plot of the engagement report frame.
sns.pairplot(views)

In [None]:
# Column dtypes of the engagement report frame.
views.dtypes

In [None]:
# Summary statistics of the engagement report frame.
views.describe()

In [None]:
# (rows, columns) of the engagement report frame.
views.shape

In [None]:
# Number of distinct engagement-report titles with no exact match in df.title.
len(set(views.title) - set(df.title))

In [None]:
# Number of distinct df titles with no exact match in the engagement report.
len(set(df.title) - set(views.title))

In [None]:
# Number of distinct titles present in both tables when compared in lower case.
len(set(df.title.str.lower()) & set(views.title.str.lower()))