In [1266]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter


In [1267]:
data = pd.read_csv('movie_bd_v5.csv')
data.sample(5)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year
50,tt2140379,26000000,30523226,Self/less,Ryan Reynolds|Ben Kingsley|Natalie Martinez|Ma...,Tarsem Singh,God Created Man. Man Created Immortality.,An extremely wealthy elderly man dying from ca...,116,Science Fiction|Mystery|Thriller,Endgame Entertainment|Ram Bergman Productions,7/10/2015,6.2,2015
1281,tt2017020,105000000,347434178,The Smurfs 2,Neil Patrick Harris|Christina Ricci|Katy Perry...,Raja Gosnell,Get ready to get naughty!,The evil wizard Gargamel creates a couple of m...,105,Fantasy|Family|Comedy|Animation,Columbia Pictures|Sony Pictures Animation|Kern...,7/30/2013,5.5,2013
1848,tt0146882,30000000,47126295,High Fidelity,John Cusack|Iben Hjejle|Todd Louiso|Jack Black...,Stephen Frears,"A comedy about fear of commitment, hating your...",When record store owner Rob Gordon gets dumped...,113,Comedy|Drama|Romance|Music,Buena Vista|Touchstone Pictures,3/17/2000,7.0,2000
1844,tt0190138,41300000,106371651,The Whole Nine Yards,Bruce Willis|Matthew Perry|Rosanna Arquette|Mi...,Jonathan Lynn,"In the heart of suburbia, a hit man with heart...",A mobster named Jimmy the Tulip agrees to coop...,98,Comedy|Crime,Franchise Pictures|Warner Bros.|Morgan Creek P...,2/18/2000,6.0,2000
1490,tt0424345,5000000,26888376,Clerks II,Brian O'Halloran|Jeff Anderson|Jason Mewes|Kev...,Kevin Smith,With No Power Comes No Responsibility,A calamity at Dante and Randall's shops sends ...,97,Comedy,The Weinstein Company|View Askew Productions,5/25/2006,6.9,2006


In [1268]:
data.describe()

Unnamed: 0,budget,revenue,runtime,vote_average,release_year
count,1889.0,1889.0,1889.0,1889.0,1889.0
mean,54310830.0,155365300.0,109.658549,6.140762,2007.860773
std,48587210.0,214669800.0,18.017041,0.764763,4.468841
min,5000000.0,2033165.0,63.0,3.3,2000.0
25%,20000000.0,34560580.0,97.0,5.6,2004.0
50%,38000000.0,83615410.0,107.0,6.1,2008.0
75%,72000000.0,178262600.0,120.0,6.6,2012.0
max,380000000.0,2781506000.0,214.0,8.1,2015.0


# Предобработка

In [1269]:
answers = {} # создадим словарь для ответов

# тут другие ваши предобработки колонок например:

#the time given in the dataset is in string format.
#So we need to change this in datetime format
# ...

In [1270]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1889 entries, 0 to 1888
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   imdb_id               1889 non-null   object 
 1   budget                1889 non-null   int64  
 2   revenue               1889 non-null   int64  
 3   original_title        1889 non-null   object 
 4   cast                  1889 non-null   object 
 5   director              1889 non-null   object 
 6   tagline               1889 non-null   object 
 7   overview              1889 non-null   object 
 8   runtime               1889 non-null   int64  
 9   genres                1889 non-null   object 
 10  production_companies  1889 non-null   object 
 11  release_date          1889 non-null   object 
 12  vote_average          1889 non-null   float64
 13  release_year          1889 non-null   int64  
dtypes: float64(1), int64(4), object(9)
memory usage: 206.7+ KB


### Check if there any non null values

In [1271]:
data.isnull().values.any()

False

### Check if columns of type 'object' are of the right data format

In [1272]:
object_data = data.select_dtypes(include='object').columns
object_data

Index(['imdb_id', 'original_title', 'cast', 'director', 'tagline', 'overview',
       'genres', 'production_companies', 'release_date'],
      dtype='object')

In [1273]:
for ob in object_data:
    print(type(ob),'\t', ob)

<class 'str'> 	 imdb_id
<class 'str'> 	 original_title
<class 'str'> 	 cast
<class 'str'> 	 director
<class 'str'> 	 tagline
<class 'str'> 	 overview
<class 'str'> 	 genres
<class 'str'> 	 production_companies
<class 'str'> 	 release_date


### Change columns with the wrong data format to the right one

In [1274]:
data['release_date'] = pd.to_datetime(data['release_date'])
type(data['release_date'][0])

pandas._libs.tslibs.timestamps.Timestamp

### Function splitting data into lists  

In [1275]:
def to_one_str(column):
    return column.str.cat(sep = '|').split('|')

### Function finding most_common substring in a string

In [1276]:
def most_common(column):
    return Counter(to_one_str(column)).most_common()

### Create a function collecting all answers to questions

In [1277]:
answers = {}

# id_name is True if the input contains both name and id
def answer(answer, question, id_name = False):
    if id_name:
        answers[question] = answer['original_title'].item() + ' (' + answer['imdb_id'].item() + ')'
    else: 
        answers[question] = answer
answers

{}

### Add profit data to the dataframe

In [1278]:
data['profit'] = data['revenue'] - data['budget']
data['profit']

0       1363528810
1        228436354
2        185238201
3       1868178225
4       1316249360
           ...    
1884      82299717
1885      -8444012
1886     -40865180
1887      -3962091
1888      -9782502
Name: profit, Length: 1889, dtype: int64

### Function to convert lists to pd.Series 

In [1279]:
def list_to_Series(x): 
    idx, values = zip(*x)
    return pd.Series(values, idx)

### Format the month df to make it work easier with it

In [1280]:
def format_month_df(months):
    months.rename(columns = {'release_date':'count'}, inplace = True)
    months.index.names = ['release_month']
    month_names = ['January', 'Feburary', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'Ocotber', 'November', 'December']
    months.index = map(lambda x: month_names[x-1], months.index)
    
    return months

# 1. У какого фильма из списка самый большой бюджет?

In [1281]:
max_budget = data[data['budget'] == data['budget'].max()]
answer(max_budget, 1, True)

answers[1]

'Pirates of the Caribbean: On Stranger Tides (tt1298650)'

# 2. Какой из фильмов самый длительный (в минутах)?

In [1282]:
longest = data[data['runtime'] == data['runtime'].max()]
answer(longest, 2, True)

answers[2]

'Gods and Generals (tt0279111)'

# 3. Какой из фильмов самый короткий (в минутах)?





In [1283]:
shortest = data[data['runtime'] == data['runtime'].min()]
answer(shortest, 3, True)

answers[3]

'Winnie the Pooh (tt1449283)'

# 4. Какова средняя длительность фильмов?


In [1284]:
avg_runtime = data['runtime'].mean()
answer(avg_runtime, 4)

answers[4]

109.6585494970884

# 5. Каково медианное значение длительности фильмов? 

In [1285]:
meadian_runtime = data['runtime'].median()
answer(meadian_runtime, 5)

answers[5]

107.0

# 6. Какой самый прибыльный фильм?
#### Внимание! Здесь и далее под «прибылью» или «убытками» понимается разность между сборами и бюджетом фильма. (прибыль = сборы - бюджет) в нашем датасете это будет (profit = revenue - budget) 

In [1286]:
most_profit = data[data['profit']==data['profit'].max()]
answer(most_profit, 6, True)

answers[6]

'Avatar (tt0499549)'

# 7. Какой фильм самый убыточный? 

In [1287]:
least_profit = data[data['profit']==data['profit'].min()]
answer(least_profit, 7, True)

answers[7]

'The Lone Ranger (tt1210819)'

# 8. У скольких фильмов из датасета объем сборов оказался выше бюджета?

In [1288]:
pos_profit = data[data['profit']>0].count()['imdb_id']
answer(pos_profit, 8)

answers[8]

1478

# 9. Какой фильм оказался самым кассовым в 2008 году?

In [1289]:
movies_2008 = data[(data['release_year']==2008)]
highest_gross =  movies_2008[movies_2008['revenue']==movies_2008['revenue'].max()]
answer(highest_gross, 9, True)

answers[9]

'The Dark Knight (tt0468569)'

# 10. Самый убыточный фильм за период с 2012 по 2014 г. (включительно)?


In [1290]:
period = data[(data['release_year']>=2012) | (data['release_year']<=2012)]
most_loss = period[period['revenue']==period['revenue'].min()]
answer(most_loss, 10, True)

answers[10]

'Mutant Chronicles (tt0490181)'

# 11. Какого жанра фильмов больше всего?

In [1291]:
# эту задачу тоже можно решать разными подходами, попробуй реализовать разные варианты
# если будешь добавлять функцию - выноси ее в предобработку что в начале
genres = data.groupby(['genres']).size().sort_values(ascending=False)
most_popular = genres[0]
print(genres)

answer(genres.index[0], 11)

answers[11]

genres
Comedy                            141
Drama                             102
Comedy|Romance                     60
Drama|Romance                      45
Comedy|Drama|Romance               35
                                 ... 
Drama|Mystery|Thriller|Fantasy      1
Drama|Mystery|Thriller|Horror       1
Drama|Romance|Comedy                1
Drama|Romance|Comedy|Music          1
Crime|Comedy|Romance                1
Length: 652, dtype: int64


'Comedy'

# 12. Фильмы какого жанра чаще всего становятся прибыльными? 

In [1292]:
profitable = data[data.profit > 0]
quantity = most_common(profitable['genres'])

display(quantity)

answer(quantity[0][0], 12)

[('Drama', 560),
 ('Comedy', 551),
 ('Thriller', 446),
 ('Action', 444),
 ('Adventure', 337),
 ('Romance', 242),
 ('Crime', 231),
 ('Family', 226),
 ('Science Fiction', 195),
 ('Fantasy', 188),
 ('Horror', 150),
 ('Animation', 120),
 ('Mystery', 119),
 ('Music', 47),
 ('History', 46),
 ('War', 41),
 ('Western', 12),
 ('Documentary', 7)]

# 13. У какого режиссера самые большие суммарные кассовые сборы?

In [1293]:
directors = data.groupby(['director'])
movie_shark = directors['revenue'].sum().sort_values(ascending = False)
answer(movie_shark.index[0], 13)

movie_shark.head(5)

director
Peter Jackson        6490593685
Christopher Nolan    4167548502
David Yates          4154295625
Michael Bay          3886938960
J.J. Abrams          3579169916
Name: revenue, dtype: int64

# 14. Какой режисер снял больше всего фильмов в стиле Action?

In [1294]:
action = data[data['genres'].str.contains('Action')][['director','genres']]
directs_most = list_to_Series(most_common(action['director']))

display(directs_most.head())

answer(directs_most.index[0],14)

Robert Rodriguez      9
Michael Bay           7
Paul W.S. Anderson    7
Antoine Fuqua         6
Ridley Scott          6
dtype: int64

# 15. Фильмы с каким актером принесли самые высокие кассовые сборы в 2012 году? 

In [1295]:
movies_12 = data[data['release_year']==2012][['original_title', 'cast', 'revenue']]
mean_revenue = movies_12['revenue'].mean()

top_movies = movies_12[movies_12['revenue']>mean_revenue].sort_values(['revenue'],ascending=False)
pop_actors = list_to_Series(most_common(top_movies['cast']))

display(pop_actors.head())

answer(pop_actors.index[0], 15)

Chris Hemsworth        2
Ralph Fiennes          2
Anne Hathaway          2
Denis Leary            2
Seann William Scott    2
dtype: int64

# 16. Какой актер снялся в большем количестве высокобюджетных фильмов?

In [1296]:
high_budget = data[data['budget'] > data['budget'].mean()][['original_title','cast','budget']]
most_pop_actor = list_to_Series(most_common(high_budget['cast']))

answer(most_pop_actor.index[0], 16)
most_pop_actor.head(3)


Matt Damon        18
Adam Sandler      17
Angelina Jolie    16
dtype: int64

# 17. В фильмах какого жанра больше всего снимался Nicolas Cage? 

In [1297]:
cages_movies = data[data['cast'].str.find('Nicolas Cage')>=0][['original_title', 'cast', 'genres']]


cages_pop_genre = list_to_Series(most_common(cages_movies['genres']))

answer(cages_pop_genre.index[0], 17)
cages_pop_genre.head()

Action      17
Thriller    15
Drama       12
Crime       10
Fantasy      8
dtype: int64

# 18. Самый убыточный фильм от Paramount Pictures

In [1298]:
paramount = data[data['production_companies'].str.find('Paramount Pictures')>=0][['imdb_id','original_title','profit']]
unsuccessful = paramount[paramount['profit']==paramount['profit'].min()]

answer(unsuccessful, 18, True)

# 19. Какой год стал самым успешным по суммарным кассовым сборам?

In [1313]:
revenue = data.groupby('release_year')['revenue'].sum().sort_values(ascending=False)
display(revenue.head())

answer(revenue.index[0], 19)

release_year
2015    25449202382
2014    23405862953
2013    23213799791
2012    23079001687
2011    22676791872
Name: revenue, dtype: int64

# 20. Какой самый прибыльный год для студии Warner Bros?

In [1300]:
WB_movies = data[data['production_companies'].str.find('Warner Bros')>0][['release_year','profit']]

most_profit_year = WB_movies.groupby('release_year').sum().sort_values(['profit'],ascending=False)

display(most_profit_year.head(3))

answer(most_profit_year.index[0], 20)

Unnamed: 0_level_0,profit
release_year,Unnamed: 1_level_1
2014,2297979392
2008,2134595031
2007,1928180689


# 21. В каком месяце за все годы суммарно вышло больше всего фильмов?

In [1301]:
months = data['release_date'].apply(lambda x: x.month)
months = format_month_df(months.groupby(months).count().to_frame())

highest_num = months.sort_values(by = ['count'], ascending = False).index[0]

answer(highest_num, 21)
answers[21]

'September'

# 22. Сколько суммарно вышло фильмов летом? (за июнь, июль, август)

In [1302]:
values = months[5:8]['count'].values
summer_mov = sum(values)

answer(summer_mov, 22)
answers[22]

450

# 23. Для какого режиссера зима – самое продуктивное время года? 

In [1303]:
df = data[['release_date','director']].copy()
df['release_date'] = df['release_date'].apply(lambda x: x.month)

winter = [12,1,2]
df = df[df['release_date'].isin(winter)]

### Вариант 1: использование counter

In [1304]:
winter_directors = most_common(df['director'])

answer(winter_directors[0][0], 23)
answers[23]

'Peter Jackson'

### Вариант 2: использование explode

In [1305]:
df['director'] = split(df['director'])
df = df.explode('director')

winter_directors = df.groupby(['director']).count().sort_values(by= 'release_date', ascending = False)

answer(winter_directors.iloc[0].name, 23)
answers[23]

'Peter Jackson'

# 24. Какая студия дает самые длинные названия своим фильмам по количеству символов?

### For questions 24 and 25

In [1306]:
def get_studios(col):
    studios = df.groupby(['production_companies']).median(col)
    studios = studios.sort_values(by = col, ascending = False)

    return studios

df = data['production_companies'].copy().to_frame()
df.insert(1, "title_len", list(map(lambda x: len(x), data['original_title'])), True)
df.insert(2, "overview_len", list(map(lambda x: len(x), data['overview'])), True)

df['production_companies'] = split(df['production_companies'])
df = df.explode('production_companies')





In [1307]:
studios = get_studios('title_len')

display(studios)

answer(studios.iloc[0].name, 24) 

Unnamed: 0_level_0,title_len,overview_len
production_companies,Unnamed: 1_level_1,Unnamed: 2_level_1
Four By Two Productions,83.0,298.0
"Jim Henson Company, The",59.0,304.0
Dos Corazones,47.0,150.0
Museum Canada Productions,46.0,302.0
Polsky Films,46.0,387.0
...,...,...
Everest Entertainment,3.0,124.0
Berlanti Productions,3.0,413.0
Ixtlan Productions,2.0,340.0
XM2 Productions,2.0,342.0


# 25. Описание фильмов какой студии в среднем самые длинные по количеству слов?

In [1308]:
studios = get_studios('overview_len')

display(studios)

answer(studios.iloc[0].name, 25) 

Unnamed: 0_level_0,title_len,overview_len
production_companies,Unnamed: 1_level_1,Unnamed: 2_level_1
Midnight Picture Show,23.0,1000.0
Room 9 Entertainment,21.0,964.0
Brookwell-McNamara Entertainment,11.0,936.0
Lions Gate Family Entertainment,15.0,909.0
Crest Animation Productions,15.0,909.0
...,...,...
Projection Pictures,16.0,74.0
London Boulevard,16.0,74.0
Phantom Four,10.0,72.0
Empire Pictures,7.0,62.0


# 26. Какие фильмы входят в 1 процент лучших по рейтингу? 
по vote_average

In [1309]:
best_movies = data[['imdb_id', 'original_title', 'vote_average']].sort_values('vote_average', ascending = False)
quantity = len(best_movies)
one_percent = round(quantity * 0.01)

display(best_movies[:one_percent])

answer(list(best_movies[:one_percent]['original_title'].values), 26)

Unnamed: 0,imdb_id,original_title,vote_average
599,tt0468569,The Dark Knight,8.1
118,tt0816692,Interstellar,8.0
125,tt2084970,The Imitation Game,8.0
9,tt2096673,Inside Out,8.0
34,tt3170832,Room,8.0
1183,tt0993846,The Wolf of Wall Street,7.9
128,tt2267998,Gone Girl,7.9
1191,tt2024544,12 Years a Slave,7.9
119,tt2015381,Guardians of the Galaxy,7.9
1081,tt0167260,The Lord of the Rings: The Return of the King,7.9


# 27. Какие актеры чаще всего снимаются в одном фильме вместе?


In [1310]:
from itertools import combinations

actors = data[['original_title', 'cast']].copy()
actors['cast'] = split(actors['cast'])
actors['combinations'] = actors['cast'].apply(lambda cast: list(combinations(cast,2)))
actors = actors.explode('combinations')

often_together = actors.groupby('combinations').count().sort_values('cast',ascending= False)
the_pairs = list(often_together.query('cast == cast.max()').index)

answer(the_pairs, 27)
answers[27]

[('Daniel Radcliffe', 'Rupert Grint'), ('Daniel Radcliffe', 'Emma Watson')]

# Submission

In [1311]:
# в конце можно посмотреть свои ответы к каждому вопросу
answers

{1: 'Pirates of the Caribbean: On Stranger Tides (tt1298650)',
 2: 'Gods and Generals (tt0279111)',
 3: 'Winnie the Pooh (tt1449283)',
 4: 109.6585494970884,
 5: 107.0,
 6: 'Avatar (tt0499549)',
 7: 'The Lone Ranger (tt1210819)',
 8: 1478,
 9: 'The Dark Knight (tt0468569)',
 10: 'Mutant Chronicles (tt0490181)',
 11: 'Comedy',
 12: 'Drama',
 13: 'Peter Jackson',
 14: 'Robert Rodriguez',
 15: 'Chris Hemsworth',
 16: 'Matt Damon',
 17: 'Action',
 18: 'K-19: The Widowmaker (tt0267626)',
 19: 2015,
 20: 2014,
 21: 'September',
 22: 450,
 23: 'Peter Jackson',
 24: 'Four By Two Productions',
 25: 'Midnight Picture Show',
 26: ['The Dark Knight',
  'Interstellar',
  'The Imitation Game',
  'Inside Out',
  'Room',
  'The Wolf of Wall Street',
  'Gone Girl',
  '12 Years a Slave',
  'Guardians of the Galaxy',
  'The Lord of the Rings: The Return of the King',
  'Memento',
  'Inception',
  'The Pianist',
  'The Grand Budapest Hotel',
  'Her',
  'Spotlight',
  'Big Hero 6',
  'The Fault in Our Star

In [1312]:
# и убедиться что ни чего не пропустил)
len(answers)

27