### Course Project.

#### Цель: с помощью дисперсионного анализа проверить, есть ли отличия средней оценки фильмов в зависимости от жанра

In [106]:
import numpy as np
import pandas as pd
from scipy import stats

In [8]:
# Load the title metadata table; read_table parses tab-separated files by default.
df_basic = pd.read_table('basic.tsv', low_memory=False)
df_basic

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
7289622,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2010,\N,\N,"Action,Drama,Family"
7289623,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
7289624,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
7289625,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


In [136]:
# Count how often each raw `genres` string occurs (multi-genre combinations are counted as separate values).
df_basic['genres'].value_counts()

Drama                      759558
\N                         570823
Comedy                     518350
Talk-Show                  377837
Documentary                347340
                            ...  
Biography,Fantasy,War           1
Adult,Crime,Talk-Show           1
Biography,Crime,Music           1
Crime,War,Western               1
Adult,Animation,Musical         1
Name: genres, Length: 2263, dtype: int64

Описание датасета df_basic:

tconst (string) - alphanumeric unique identifier of the title \
titleType (string) – the type/format of the title (e.g. movie, short, tvseries, tvepisode, video, etc) \
primaryTitle (string) – the more popular title / the title used by the filmmakers on promotional materials at the point of release \
originalTitle (string) - original title, in the original language \
isAdult (boolean) - 0: non-adult title; 1: adult title \
startYear (YYYY) – represents the release year of a title. In the case of TV Series, it is the series start year \
endYear (YYYY) – TV Series end year. ‘\N’ for all other title types \
runtimeMinutes – primary runtime of the title, in minutes \
genres (string array) – includes up to three genres associated with the title

У некоторых фильмов указано сразу несколько жанров. Оценку такого фильма нужно будет учесть в каждом из его жанров.

In [81]:
# Load the ratings table, replacing the file's own header row with the
# column names used in the rest of the notebook (the first column becomes `id`).
rating_columns = ['id', 'averageRating', 'numVotes']
df_rating = pd.read_csv(
    'title_ratings.tsv',
    sep='\t',
    header=0,
    names=rating_columns,
    low_memory=False,
)
df_rating

Unnamed: 0,id,averageRating,numVotes
0,tt0000001,5.6,1656
1,tt0000002,6.1,200
2,tt0000003,6.5,1368
3,tt0000004,6.2,122
4,tt0000005,6.2,2151
...,...,...,...
1086451,tt9916580,7.2,5
1086452,tt9916690,6.6,5
1086453,tt9916720,6.0,65
1086454,tt9916766,6.9,14


Описание датасета rating:

tconst (string) - alphanumeric unique identifier of the title (в df_rating этот столбец переименован в id) \
averageRating – weighted average of all the individual user ratings \
numVotes - number of votes the title has received

In [13]:
# Summarise the columns, their dtypes and the memory footprint of the title table.
df_basic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7289627 entries, 0 to 7289626
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         int64 
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: int64(1), object(8)
memory usage: 500.5+ MB


In [14]:
# Count the titles of each type to see how many of them are feature films ("movie").
df_basic['titleType'].value_counts()

tvEpisode       5250669
short            776521
movie            557038
video            282535
tvSeries         195015
tvMovie          110980
tvMiniSeries      33977
tvSpecial         30087
videoGame         26586
tvShort           26219
Name: titleType, dtype: int64

In [96]:
# Keep only the rows whose title type is "movie".
is_movie = df_basic['titleType'] == 'movie'
df_basic_movie = df_basic.loc[is_movie]

# Only the title id and the genre string are needed; `tconst` is renamed to `id`
# so that it matches the key column of df_rating.
list_of_needed_colums = ['tconst', 'genres']
df_basic_movie_needed = df_basic_movie[list_of_needed_colums].rename(columns={'tconst': 'id'})

# Left join: every movie is kept, movies without a rating get NaN in the rating columns.
df_basic_movie_needed_merged = pd.merge(df_basic_movie_needed, df_rating, how='left', on='id')

# Split the comma-separated genre string into one column per genre.
df_basic_movie_gender_split = df_basic_movie_needed_merged['genres'].str.split(',', expand=True)

# Put id and rating side by side with the split genre columns (aligned on the shared row index).
df_basic_movie_needed_split = df_basic_movie_needed_merged[['id', 'averageRating']].join(
    df_basic_movie_gender_split
)
df_basic_movie_needed_split.columns = ['id', 'rating', '0', '1', '2']  # short names for convenience
df_basic_movie_needed_split

Unnamed: 0,id,rating,0,1,2
0,tt0000009,5.9,Romance,,
1,tt0000335,6.1,Biography,Drama,
2,tt0000502,3.8,\N,,
3,tt0000574,6.1,Biography,Crime,Drama
4,tt0000615,4.5,Drama,,
...,...,...,...,...,...
557033,tt9916622,,Documentary,,
557034,tt9916680,,Documentary,,
557035,tt9916706,,Comedy,,
557036,tt9916730,,\N,,


In [97]:
# One frame per genre slot (id, rating, genre in that slot); they are later
# cleaned of missing values and stacked into a single table.
df_0, df_1, df_2 = (
    df_basic_movie_needed_split[['id', 'rating', genre_slot]]
    for genre_slot in ('0', '1', '2')
)

In [101]:
# Drop rows with a missing rating or an empty genre slot, then give the genre
# column a common name so the three frames can be concatenated.
df_0_needed = df_0.dropna().rename(columns={'0': 'genre'})
df_1_needed = df_1.dropna().rename(columns={'1': 'genre'})
df_2_needed = df_2.dropna().rename(columns={'2': 'genre'})

In [121]:
# Stack the three per-slot frames into one long table: one row per (movie, genre) pair.
genre_frames = [df_0_needed, df_1_needed, df_2_needed]
df_genre = pd.concat(genre_frames, axis=0).loc[:, ['rating', 'genre']]  # id is no longer needed
df_genre

Unnamed: 0,rating,genre
0,5.9,Romance
1,6.1,Biography
2,3.8,\N
3,6.1,Biography
4,4.5,Drama
...,...,...
556737,6.8,Crime
556936,8.0,Comedy
557019,3.6,History
557029,6.3,Horror


In [122]:
df_genre['genre'].value_counts()  # list the genres that occur and how many ratings each one has

Drama          113028
Comedy          64092
Documentary     36210
Romance         28766
Action          27497
Crime           21801
Thriller        20527
Horror          17783
Adventure       16951
\N              10235
Family           9712
Mystery          9339
Biography        7575
Fantasy          7450
History          6887
Music            6162
Sci-Fi           6103
Musical          5745
War              5595
Animation        4338
Western          4127
Adult            3864
Sport            2941
Film-Noir         786
News              649
Reality-TV         31
Short              15
Talk-Show           5
Game-Show           1
Name: genre, dtype: int64

\N — это не жанр, а обозначение пропущенного значения в датасетах IMDb (жанр не указан). От него у меня не получилось избавиться технически, поэтому я просто не включаю его в расчеты в дальнейшем.

Создаем список, каждый элемент которого — массив оценок одного жанра.

In [126]:
# Genre labels in the order returned by value_counts().
genre_list = df_genre['genre'].value_counts().index

# One NumPy array of ratings per genre, in the same order as genre_list.
list_of_ratings_under_genre = [
    np.array(df_genre.loc[df_genre['genre'] == genre_name]['rating'])
    for genre_name in genre_list
]
list_of_ratings_under_genre

[array([4.5, 3.2, 4.9, ..., 9. , 5.7, 4.4]),
 array([4.3, 3.6, 4.5, ..., 6.2, 6.3, 8. ]),
 array([7.4, 5.6, 7.1, ..., 6.3, 7.9, 5.4]),
 array([5.9, 5.3, 6. , ..., 6.8, 5.9, 5.1]),
 array([5.7, 6.4, 7. , ..., 5.1, 8.5, 8. ]),
 array([4.7, 6. , 7. , ..., 7.4, 7.8, 6.8]),
 array([6.4, 7.3, 7.3, ..., 7.2, 7.7, 3.8]),
 array([6.8, 5.2, 7.2, ..., 6.2, 6.1, 6.3]),
 array([5.2, 3.1, 7.1, ..., 5.1, 8. , 6.6]),
 array([3.8, 4.5, 4.8, ..., 7.4, 7. , 7.4]),
 array([7. , 6.9, 6.7, ..., 7. , 5.7, 6.1]),
 array([5.8, 6.4, 5.4, ..., 8. , 8.5, 7.2]),
 array([6.1, 6.1, 5.2, ..., 3.9, 7.5, 5.5]),
 array([6.4, 7.4, 6.8, ..., 3.5, 4.9, 5.1]),
 array([6.6, 5.4, 6.4, ..., 6.3, 7.2, 3.6]),
 array([6.1, 6.7, 7.3, ..., 6.7, 8.5, 7.8]),
 array([3.8, 4.7, 5.6, ..., 3.8, 5. , 7.8]),
 array([4.7, 8.5, 7. , ..., 6.8, 4.6, 1.5]),
 array([7.7, 6.4, 6. , ..., 5.9, 6.5, 4.1]),
 array([6.7, 5.4, 6.7, ..., 6.6, 6.1, 5.1]),
 array([6.1, 7.2, 6.9, ..., 4.7, 7.5, 8. ]),
 array([5.3, 5.8, 6.2, ..., 7.1, 7.7, 6.9]),
 array([5.

In [131]:
genre_list[9]  # element number 9 will not take part in the calculations

'\\N'

In [133]:
# Number of rating arrays minus one (presumably the excluded '\N' entry — confirm).
len(list_of_ratings_under_genre) - 1

28

In [134]:
# Degrees of freedom for the one-way ANOVA, derived from the data instead of
# numbers copied by hand from earlier outputs.
# The missing-genre marker '\N' is not a real group and is left out.
group_sizes = [
    len(ratings)
    for genre, ratings in zip(genre_list, list_of_ratings_under_genre)
    if genre != '\\N'
]

k = len(group_sizes)   # number of compared groups (genres)
df_between = k - 1     # between-group degrees of freedom
n = sum(group_sizes)   # total number of ratings across the compared groups
df_inside = n - k      # within-group degrees of freedom

print(df_between, df_inside)

27 427952


In [137]:
# Critical value of the F distribution at significance level alpha for
# (df_between, df_inside) degrees of freedom, computed instead of being taken
# from a table. For 27 and ~428k degrees of freedom it is about 1.49, not 1.03.
alpha = 0.05
K_fisher_krit = stats.f.ppf(1 - alpha, df_between, df_inside)

In [132]:
# One-way ANOVA over the ratings of every genre except the missing-genre marker '\N'.
# Groups are selected by label rather than by a hard-coded position, so the call
# stays correct if the order or the number of genres changes.
anova_groups = [
    ratings
    for genre, ratings in zip(genre_list, list_of_ratings_under_genre)
    if genre != '\\N'
]
stats.f_oneway(*anova_groups)

F_onewayResult(statistic=2568.051327317674, pvalue=0.0)

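Наблюдаемое значение F-статистики (2568.05) намного больше критического значения K_fisher_krit, а p-value ≈ 0, поэтому нулевая гипотеза о равенстве средних отвергается. Различия по средним оценкам среди разных жанров есть при уровне значимости a = 0.05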