In [230]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from datetime import datetime
from itertools import combinations

In [231]:
# Load the movie dataset; the path is relative to the notebook's working
# directory.
data = pd.read_csv('movie_bd_v5.csv')
# Preview five random rows. The seed is fixed so that the preview shows the
# same rows after every "Restart Kernel -> Run All".
data.sample(5, random_state=42)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year
91,tt2938956,25000000,72629670,The Transporter Refueled,Ed Skrein|Ray Stevenson|Loan Chabanol|Gabriell...,Camille Delamarre,He Delivers.,The fast-paced action movie is again set in th...,96,Thriller|Action|Crime,EuropaCorp,9/3/2015,5.3,2015
272,tt0362478,30000000,33333531,The Box,Cameron Diaz|James Marsden|Frank Langella|Mich...,Richard Kelly,All you have to do is push the button.,"Norma and Arthur Lewis, a suburban couple with...",115,Thriller|Science Fiction,Media Rights Capital|Lin Pictures|Warner Bros....,9/17/2009,5.3,2009
1666,tt0359517,12000000,31179516,Johnson Family Vacation,Cedric the Entertainer|Vanessa Williams|Steve ...,Christopher Erskin,Take the ride.,AAA can't help the roadside emergency that is ...,97,Comedy|Family,Fox Searchlight Pictures|Bird and a Bear Enter...,4/7/2004,5.3,2004
536,tt0218922,42000000,35402320,Original Sin,Angelina Jolie|Antonio Banderas|Gregory Itzin|...,Michael Cristofer,This is not a love story - it's a story about ...,A young man is plunged into a life of subterfu...,118,Drama|Thriller|Mystery|Romance,Metro-Goldwyn-Mayer (MGM),8/3/2001,5.8,2001
424,tt1438254,44000000,48190704,Charlie St. Cloud,Zac Efron|Amanda Crew|Kim Basinger|Chris Masso...,Burr Steers,Life is for living,Accomplished sailor Charlie St. Cloud has the ...,99,Drama,Universal Pictures|Marc Platt Productions|Rela...,5/1/2010,6.8,2010


In [232]:
# get some info on data types and possible indication on something that
# requires to be preprocessed
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1889 entries, 0 to 1888
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   imdb_id               1889 non-null   object 
 1   budget                1889 non-null   int64  
 2   revenue               1889 non-null   int64  
 3   original_title        1889 non-null   object 
 4   cast                  1889 non-null   object 
 5   director              1889 non-null   object 
 6   tagline               1889 non-null   object 
 7   overview              1889 non-null   object 
 8   runtime               1889 non-null   int64  
 9   genres                1889 non-null   object 
 10  production_companies  1889 non-null   object 
 11  release_date          1889 non-null   object 
 12  vote_average          1889 non-null   float64
 13  release_year          1889 non-null   int64  
dtypes: float64(1), int64(4), object(9)
memory usage: 206.7+ KB


In [233]:
data.describe()

Unnamed: 0,budget,revenue,runtime,vote_average,release_year
count,1889.0,1889.0,1889.0,1889.0,1889.0
mean,54310830.0,155365300.0,109.658549,6.140762,2007.860773
std,48587210.0,214669800.0,18.017041,0.764763,4.468841
min,5000000.0,2033165.0,63.0,3.3,2000.0
25%,20000000.0,34560580.0,97.0,5.6,2004.0
50%,38000000.0,83615410.0,107.0,6.1,2008.0
75%,72000000.0,178262600.0,120.0,6.6,2012.0
max,380000000.0,2781506000.0,214.0,8.1,2015.0


# Предобработка

In [234]:
answers = {}  # dictionary that collects the answer to every question

# Additional column preprocessing goes below.

# Profit (revenue - budget) is needed by several questions, so it is kept as
# a dedicated column.
data['profit'] = data['revenue'] - data['budget']

# release_date is stored as an 'm/d/YYYY' string. Several questions work with
# the month of release, so the dates are parsed once (vectorised) and the
# month name is kept as a separate column.
# month_name() returns English names by default, independent of the process
# locale; the later cells filter on literal names such as "June" or
# "December", which strftime('%B') would only produce under an English locale.
data['release_month'] = pd.to_datetime(
    data.release_date, format='%m/%d/%Y').dt.month_name()


# Util func for alternative solution in Q11-12
def get_genres_counter(genres):
    '''
    Count how often each genre occurs in a collection of genre strings.

    Parameters
    ----------
    genres : iterable of str
        Every item holds one or more genre names joined by '|',
        e.g. 'Action|Drama|Thriller'.

    Returns
    -------
    collections.Counter
        Genre name mapped to the number of times it was encountered.
    '''
    # The generator expression hands genres to the Counter one at a time, so
    # no intermediate list holding every single genre is built.
    return Counter(
        genre
        for genres_string in genres
        for genre in genres_string.split('|'))


# Frame with one row per (movie, actor): the '|'-separated cast string is
# split into a list and exploded into rows. It backs the actor-related
# questions (Q15-Q17).
# assign() returns a new frame, so `data` itself is not modified.
data_cast = data.assign(cast=data['cast'].str.split('|')).explode('cast')

# 1. У какого фильма из списка самый большой бюджет?

Использовать варианты ответов в коде решения запрещено.    
Вы думаете и в жизни у вас будут варианты ответов?)

In [235]:
# Store the question number and your answer to it in the dictionary.
# Example (template placeholder; the key '1' is assigned again in the
# "Q1 answer" cell further down):
answers['1'] = '2. Spider-Man 3 (tt0413300)'
# Write down your own answer.
# If the answer turned out to be correct, you may add a comment with a "+" sign.

In [236]:
# Q1 solution
# data.info() reports 1889 non-null int64 values for budget, so the column
# can be aggregated without preprocessing.
# Keep every row whose budget equals the column maximum.
answer_data = data.loc[data['budget'].eq(data['budget'].max())]
display(answer_data)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month
723,tt1298650,380000000,1021683000,Pirates of the Caribbean: On Stranger Tides,Johnny Depp|PenÃ©lope Cruz|Geoffrey Rush|Ian M...,Rob Marshall,Live Forever Or Die Trying.,Captain Jack Sparrow crosses paths with a woma...,136,Adventure|Action|Fantasy,Walt Disney Pictures|Jerry Bruckheimer Films|M...,5/11/2011,6.3,2011,641683000,May


ВАРИАНТ 2

In [237]:
# Q1 alternative solution
# Same filter expressed as a query string; the maximum budget is computed in
# Python first and then substituted into the expression.
answer_data = data.query('budget=={}'.format(data.budget.max()))
display(answer_data)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month
723,tt1298650,380000000,1021683000,Pirates of the Caribbean: On Stranger Tides,Johnny Depp|PenÃ©lope Cruz|Geoffrey Rush|Ian M...,Rob Marshall,Live Forever Or Die Trying.,Captain Jack Sparrow crosses paths with a woma...,136,Adventure|Action|Fantasy,Walt Disney Pictures|Jerry Bruckheimer Films|M...,5/11/2011,6.3,2011,641683000,May


In [238]:
# Q1 answer
answers['1'] = '5. Pirates of the Caribbean: On Stranger Tides (tt1298650)' # +

# 2. Какой из фильмов самый длительный (в минутах)?

In [239]:
# Q2 solution
# runtime holds 1889 non-null int64 values (see data.info()), so it can be
# aggregated as is. Keep every row whose runtime equals the column maximum.
answer_data = data.loc[data['runtime'].eq(data['runtime'].max())]
display(answer_data)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month
1157,tt0279111,56000000,12923936,Gods and Generals,Stephen Lang|Jeff Daniels|Robert Duvall|Kevin ...,Ronald F. Maxwell,The nations heart was touched by...,The film centers mostly around the personal an...,214,Drama|History|War,Turner Pictures|Antietam Filmworks,2/21/2003,5.8,2003,-43076064,February


In [240]:
# Q2 answer
answers['2'] = '2. Gods and Generals (tt0279111)' # +

# 3. Какой из фильмов самый короткий (в минутах)?





In [241]:
# Q3 solution
# Mirror of Q2: keep every row whose runtime equals the column minimum.
answer_data = data.loc[data['runtime'].eq(data['runtime'].min())]
display(answer_data)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month
768,tt1449283,30000000,14460000,Winnie the Pooh,Jim Cummings|Travis Oates|Jim Cummings|Bud Luc...,Stephen Anderson|Don Hall,Oh Pooh.,"During an ordinary day in Hundred Acre Wood, W...",63,Animation|Family,Walt Disney Pictures|Walt Disney Animation Stu...,4/13/2011,6.8,2011,-15540000,April


In [242]:
# Q3 answer
answers['3'] = '3. Winnie the Pooh (tt1449283)' # +

# 4. Какова средняя длительность фильмов?


In [243]:
# Q4 solution
# Average runtime over all movies, rounded to a whole number of minutes.
round(data['runtime'].mean())

110

ВАРИАНТ 2

In [244]:
# Q4 alternative solution
# The summary table produced by data.describe() already contains the mean of
# every numeric column; read the runtime cell from its 'mean' row.
round(data.describe().loc['mean', 'runtime'])

110

In [245]:
# Q4 answer
answers['4'] = '2. 110' # +

# 5. Каково медианное значение длительности фильмов? 

In [246]:
# Q5 solution
# Median runtime over all movies, rounded to a whole number of minutes.
round(data['runtime'].median())

107

In [247]:
# Q5 answer
answers['5'] = '1. 107' # +

# 6. Какой самый прибыльный фильм?
#### Внимание! Здесь и далее под «прибылью» или «убытками» понимается разность между сборами и бюджетом фильма. (прибыль = сборы - бюджет) в нашем датасете это будет (profit = revenue - budget) 

In [248]:
# Q6 solution
# profit (revenue - budget) was added as a column during preprocessing.
# Keep every row whose profit equals the column maximum.
answer_data = data.loc[data['profit'].eq(data['profit'].max())]
display(answer_data)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month
239,tt0499549,237000000,2781505847,Avatar,Sam Worthington|Zoe Saldana|Sigourney Weaver|S...,James Cameron,Enter the World of Pandora.,"In the 22nd century, a paraplegic Marine is di...",162,Action|Adventure|Fantasy|Science Fiction,Ingenious Film Partners|Twentieth Century Fox ...,12/10/2009,7.1,2009,2544505847,December


In [249]:
# Q6 answer
answers['6'] = '5. Avatar (tt0499549)' # +

# 7. Какой фильм самый убыточный? 

In [250]:
# Q7 solution
# Mirror of Q6: keep every row whose profit equals the column minimum, i.e.
# the largest loss.
answer_data = data.loc[data['profit'].eq(data['profit'].min())]
display(answer_data)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month
1245,tt1210819,255000000,89289910,The Lone Ranger,Johnny Depp|Armie Hammer|William Fichtner|Hele...,Gore Verbinski,Never Take Off the Mask,The Texas Rangers chase down a gang of outlaws...,149,Action|Adventure|Western,Walt Disney Pictures|Jerry Bruckheimer Films|I...,7/3/2013,6.0,2013,-165710090,July


In [251]:
# Q7 answer
answers['7'] = '5. The Lone Ranger (tt1210819)' # +

# 8. У скольких фильмов из датасета объем сборов оказался выше бюджета?

In [252]:
# Q8 solution
# Revenue above budget is the same as a positive profit, which reads nicely
# as a query string. The number of matching rows is the answer.
answer_data = data.query('profit>0')
answer_data.shape[0]

1478

ВАРИАНТ 2

In [253]:
# Q8 alternative solution
# Compare the two source columns directly instead of using the derived
# profit column.
answer_data = data.loc[data['revenue'].gt(data['budget'])]
answer_data.shape[0]

1478

In [254]:
# Q8 answer
answers['8'] = '1. 1478' # +

# 9. Какой фильм оказался самым кассовым в 2008 году?

In [255]:
# Q9 solution
# Restrict the data to movies released in 2008, then keep every row whose
# revenue equals the maximum revenue within that year.
data2008 = data.loc[data['release_year'].eq(2008)]
answer_data = data2008.loc[data2008['revenue'].eq(data2008['revenue'].max())]
answer_data

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month
599,tt0468569,185000000,1001921825,The Dark Knight,Christian Bale|Michael Caine|Heath Ledger|Aaro...,Christopher Nolan,Why So Serious?,Batman raises the stakes in his war on crime. ...,152,Drama|Action|Crime|Thriller,DC Comics|Legendary Pictures|Warner Bros.|Syncopy,7/16/2008,8.1,2008,816921825,July


In [256]:
# Q9 answer
answers['9'] = '4. The Dark Knight (tt0468569)' # +

# 10. Самый убыточный фильм за период с 2012 по 2014 г. (включительно)?


In [257]:
# Q10 solution
# Restrict the data to releases from 2012 through 2014 (both bounds
# included), then keep every row whose profit equals the minimum profit
# within that period.
data_period = data.query('2012<=release_year<=2014')
answer_data = data_period.loc[
    data_period['profit'].eq(data_period['profit'].min())]
answer_data

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month
1245,tt1210819,255000000,89289910,The Lone Ranger,Johnny Depp|Armie Hammer|William Fichtner|Hele...,Gore Verbinski,Never Take Off the Mask,The Texas Rangers chase down a gang of outlaws...,149,Action|Adventure|Western,Walt Disney Pictures|Jerry Bruckheimer Films|I...,7/3/2013,6.0,2013,-165710090,July


In [258]:
# Q10 answer
answers['10'] = '5. The Lone Ranger (tt1210819)' # +

# 11. Какого жанра фильмов больше всего?

In [259]:
# Q11 solution
# Split the '|'-separated genre strings and explode them so that every row
# holds a single genre, count the occurrences of each genre (value_counts
# sorts in descending order by default) and keep the most frequent one.
(data['genres']
 .str.split('|')
 .explode()
 .value_counts()
 .head(1))

Drama    782
Name: genres, dtype: int64

ВАРИАНТ 2

In [260]:
# Q11 alternative solution
# collections.Counter does the same job. The counting is wrapped in
# get_genres_counter (defined in the preprocessing cell) because the next
# question needs it as well.
genres = get_genres_counter(data['genres'])
genres.most_common(1)

[('Drama', 782)]

In [261]:
# Q11 answer
answers['11'] = '3. Drama' # +

# 12. Фильмы какого жанра чаще всего становятся прибыльными? 

In [262]:
# Q12 solution
# Same counting as in Q11, but only over the genres of movies with a
# positive profit.
(data.loc[data['profit'].gt(0), 'genres']
 .str.split('|')
 .explode()
 .value_counts()
 .head(1))

Drama    560
Name: genres, dtype: int64

ВАРИАНТ 2

In [263]:
# Q12 alternative solution
# Counter-based variant of Q12: count the genres of profitable movies only;
# the most common entry is the answer.
profitable_genres = get_genres_counter(data.loc[data['profit'].gt(0), 'genres'])
profitable_genres.most_common(1)

[('Drama', 560)]

In [264]:
# Q12 answer
answers['12'] = '1. Drama' # +

# 13. У какого режиссера самые большие суммарные кассовые сборы?

In [265]:
# Q13 solution
# A movie can list several directors separated by '|'. As in Q14 and Q23,
# the director column is split and exploded first so that every co-director
# gets an own row (and is credited with the full revenue of the movie)
# instead of the combined 'A|B' string being treated as one more director.
# Revenue is then summed per director; the first row after sorting in
# descending order is the answer.
director_grouped = (
    data.assign(director=data.director.str.split('|'))
    .explode('director')
    .groupby('director')
    .revenue.sum()
)
director_grouped.sort_values(ascending=False).head(1)

director
Peter Jackson    6490593685
Name: revenue, dtype: int64

In [266]:
# Q13 answer
answers['13'] = '5. Peter Jackson' # +

# 14. Какой режиссер снял больше всего фильмов в стиле Action?

In [267]:
# Q14 solution
# Take the director column of every movie whose genre string contains
# 'Action'. A movie may list several directors separated by '|', so the
# strings are split and exploded to give every director an own row before
# the rows are counted.
action_directors = (
    data.loc[data['genres'].str.contains('Action'), 'director']
    .str.split('|')
    .explode()
)
action_directors.value_counts().head(1)

Robert Rodriguez    9
Name: director, dtype: int64

In [268]:
# Q14 answer
answers['14'] = '3. Robert Rodriguez' # +

# 15. Фильмы с каким актером принесли самые высокие кассовые сборы в 2012 году? 

In [269]:
# Q15 solution
# data_cast (one actor per row, built during preprocessing) is shared by all
# actor-related questions. Here: keep the 2012 releases, sum the revenue per
# actor and sort in descending order so that the top earner comes first.
answer_data = (
    data_cast.loc[data_cast['release_year'].eq(2012)]
    .groupby('cast')['revenue']
    .sum()
    .sort_values(ascending=False)
)
answer_data.head(1)

cast
Chris Hemsworth    2027450773
Name: revenue, dtype: int64

In [270]:
# Q15 answer
answers['15'] = '3. Chris Hemsworth' # +

# 16. Какой актер снялся в наибольшем количестве высокобюджетных фильмов?

In [271]:
# Q16 solution
# A movie counts as high-budget when its budget is above the average movie
# budget. The average is taken from `data` (one row per movie): in data_cast
# every movie is repeated once per listed actor, so a mean computed there
# would be weighted by cast size.
# data_cast (one actor per row) is filtered by that threshold, the distinct
# imdb_id values are counted per actor, and the actor with the highest count
# comes first after sorting.
answer_data = data_cast[data_cast.budget > data.budget.mean()]
answer_data.groupby(['cast']).imdb_id.nunique().sort_values(
    ascending=False).head(1)

cast
Matt Damon    18
Name: imdb_id, dtype: int64

In [272]:
# Q16 answer
answers['16'] = '3. Matt Damon' # +

# 17. В фильмах какого жанра больше всего снимался Nicolas Cage? 

In [273]:
# Q17 solution
# data_cast has one actor per row, so the movies with Nicolas Cage are a
# simple equality filter. Their genre strings are then split, exploded into
# one genre per row and counted.
answer_data = data_cast.loc[data_cast['cast'].eq('Nicolas Cage')]
(answer_data['genres']
 .str.split('|')
 .explode()
 .value_counts()
 .head(1))

Action    17
Name: genres, dtype: int64

In [274]:
# Q17 answer
answers['17'] = '2. Action' # +

# 18. Самый убыточный фильм от Paramount Pictures

In [275]:
# Q18 solution
# Keep the movies whose production_companies string contains
# 'Paramount Pictures' and sort them by profit in ascending order, so the
# least profitable movie is the first row.
(data[data['production_companies'].str.contains('Paramount Pictures')]
 .sort_values('profit')
 .head(1))

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month
925,tt0267626,100000000,35168966,K-19: The Widowmaker,Harrison Ford|Liam Neeson|Peter Sarsgaard|Joss...,Kathryn Bigelow,Fate has found its hero.,When Russia's first nuclear submarine malfunct...,138,Thriller|Drama|History,Paramount Pictures|Intermedia Films|National G...,7/19/2002,6.0,2002,-64831034,July


In [276]:
# Q18 answer
answers['18'] = '1. K-19: The Widowmaker (tt0267626)' # +

# 19. Какой год стал самым успешным по суммарным кассовым сборам?

In [277]:
# Q19 solution
# Sum the revenue per release year and put the year with the largest total
# first.
(data.groupby('release_year')['revenue']
 .sum()
 .sort_values(ascending=False)
 .head(1))

release_year
2015    25449202382
Name: revenue, dtype: int64

In [278]:
# Q19 answer
answers['19'] = '5. 2015' # +

# 20. Какой самый прибыльный год для студии Warner Bros?

In [279]:
# Q20 solution
# Keep the movies whose production_companies string contains 'Warner Bros'
# (a substring match, so any company name containing it qualifies), sum the
# profit per release year and put the most profitable year first.
(data[data['production_companies'].str.contains('Warner Bros')]
 .groupby('release_year')['profit']
 .sum()
 .sort_values(ascending=False)
 .head(1))

release_year
2014    2295464519
Name: profit, dtype: int64

In [280]:
# Q20 answer
answers['20'] = '1. 2014' # +

# 21. В каком месяце за все годы суммарно вышло больше всего фильмов?

In [281]:
# Q21 solution
# release_month was derived from release_date during preprocessing.
# Count the movies per month, sort the counts in descending order and take
# the label of the first entry, i.e. the month with the most releases.
(data.groupby('release_month')['imdb_id']
 .count()
 .sort_values(ascending=False)
 .index[0])

'September'

In [282]:
# Q21 answer
answers['21'] = '4. Сентябрь' # +

# 22. Сколько суммарно вышло фильмов летом? (за июнь, июль, август)

In [283]:
# Q22 solution
# Select the movies released in one of the three summer months and count
# their ids.
(data
 .query('release_month in ["June","July","August"]')
 .loc[:, 'imdb_id']
 .count())

450

In [284]:
# Q22 answer
answers['22'] = '2. 450' # +

# 23. Для какого режиссера зима – самое продуктивное время года? 

In [285]:
# Q23 solution
# Select the movies released in a winter month, give every co-director an
# own row by splitting and exploding the director column, then count how
# often each director appears.
(data
 .query('release_month in ["December", "January", "February"]')
 .loc[:, 'director']
 .str.split('|')
 .explode()
 .value_counts()
 .head(1))

Peter Jackson    7
Name: director, dtype: int64

In [286]:
# Q23 answer
answers['23'] = '5. Peter Jackson' # +

# 24. Какая студия дает самые длинные названия своим фильмам по количеству символов?

In [287]:
# Q24 solution
# Build a frame with one row per (movie, production company) plus a helper
# column holding the title length in characters, then average that length
# per company. After sorting in descending order the first index label is
# the company with the longest titles on average.
# assign() returns a new frame, so no explicit copy of `data` is needed, and
# .str.len() is the vectorised equivalent of applying len() to every title.
studio_data = data.assign(
    production_companies=data.production_companies.str.split('|'),
    title_len=data.original_title.str.len(),
).explode('production_companies')
studio_data.groupby(['production_companies']).title_len.mean().sort_values(
    ascending=False).index[0]

'Four By Two Productions'

In [288]:
# Q24 answer
answers['24'] = '5. Four By Two Productions' # +

# 25. Описание фильмов какой студии в среднем самые длинные по количеству слов?

In [289]:
# Q25 solution
# Same approach as in Q24, but the helper column holds the number of
# whitespace-separated words in the overview. The word count is averaged per
# production company; the first index label after sorting in descending
# order is the answer.
# assign() returns a new frame, so no explicit copy of `data` is needed, and
# .str.split().str.len() is the vectorised equivalent of len(s.split()).
studio_data = data.assign(
    production_companies=data.production_companies.str.split('|'),
    overview_words=data.overview.str.split().str.len(),
).explode('production_companies')
studio_data.groupby(['production_companies']).overview_words.mean().sort_values(
    ascending=False).index[0]


'Midnight Picture Show'

In [290]:
# Q25 answer
answers['25'] = '3. Midnight Picture Show' # +

# 26. Какие фильмы входят в 1 процент лучших по рейтингу? 
по vote_average

In [291]:
# Q26 solution
# One percent of the number of distinct movies (rounded) gives the size of
# the top slice. The movies are sorted by vote_average in descending order
# and the titles of the first `binsize` rows are returned as a list.
binsize = round(data['imdb_id'].nunique() * 0.01)
(data.sort_values('vote_average', ascending=False)
 .original_title
 .iloc[:binsize]
 .tolist())


['The Dark Knight',
 'Interstellar',
 'The Imitation Game',
 'Inside Out',
 'Room',
 'The Wolf of Wall Street',
 'Gone Girl',
 '12 Years a Slave',
 'Guardians of the Galaxy',
 'The Lord of the Rings: The Return of the King',
 'Memento',
 'Inception',
 'The Pianist',
 'The Grand Budapest Hotel',
 'Her',
 'Spotlight',
 'Big Hero 6',
 'The Fault in Our Stars',
 'The Lord of the Rings: The Two Towers']

In [292]:
# Q26 answer
answers['26'] = '1. Inside Out, The Dark Knight, 12 Years a Slave' # +

# 27. Какие актеры чаще всего снимаются в одном фильме вместе?


In [299]:
# Q27 solution
# Build every pair of actors that share a movie with
# itertools.combinations, then count the pairs with a Series.
# Two normalisations keep the counts meaningful:
#   * dict.fromkeys() drops a name that is listed twice in the same cast
#     (which would otherwise produce a pair of an actor with themselves)
#     while keeping the billing order;
#   * tuple(sorted(pair)) gives every pair a canonical order, so that (A, B)
#     and (B, A) from movies with a different billing order are counted as
#     the same pair.
# NOTE: several pairs may share the highest count; index[0] returns only one
# of them.
pairs = []
for cast in data.cast.str.split('|'):
    unique_cast = dict.fromkeys(cast)
    pairs.extend(tuple(sorted(pair)) for pair in combinations(unique_cast, 2))
pd.Series(pairs).value_counts().index[0]

('Daniel Radcliffe', 'Rupert Grint')

ВАРИАНТ 2

In [294]:
# Q27 alternative solution
# Same idea as above, counted with collections.Counter instead of a Series.
# Counter.update() increments the count of every pair it is given, so no
# manual "is the key already there" branch is needed.
# As above, a name repeated within one cast is dropped (dict.fromkeys keeps
# the billing order) and every pair is stored in sorted order so that (A, B)
# and (B, A) are counted as the same pair.
# most_common() lists elements with equal counts in the order they were
# first encountered.
pair_counter = Counter()
for cast in data.cast.str.split('|'):
    unique_cast = dict.fromkeys(cast)
    pair_counter.update(
        tuple(sorted(pair)) for pair in combinations(unique_cast, 2))
pair_counter.most_common(1)[0][0]

('Daniel Radcliffe', 'Rupert Grint')

In [295]:
# Q27 answer
answers['27'] = '5. Daniel Radcliffe & Rupert Grint' # +

# Submission

In [296]:
# At the end you can review your answers to every question.
answers

{'1': '5. Pirates of the Caribbean: On Stranger Tides (tt1298650)',
 '2': '2. Gods and Generals (tt0279111)',
 '3': '3. Winnie the Pooh (tt1449283)',
 '4': '2. 110',
 '5': '1. 107',
 '6': '5. Avatar (tt0499549)',
 '7': '5. The Lone Ranger (tt1210819)',
 '8': '1. 1478',
 '9': '4. The Dark Knight (tt0468569)',
 '10': '5. The Lone Ranger (tt1210819)',
 '11': '3. Drama',
 '12': '1. Drama',
 '13': '5. Peter Jackson',
 '14': '3. Robert Rodriguez',
 '15': '3. Chris Hemsworth',
 '16': '3. Matt Damon',
 '17': '2. Action',
 '18': '1. K-19: The Widowmaker (tt0267626)',
 '19': '5. 2015',
 '20': '1. 2014',
 '21': '4. Сентябрь',
 '22': '2. 450',
 '23': '5. Peter Jackson',
 '24': '5. Four By Two Productions',
 '25': '3. Midnight Picture Show',
 '26': '1. Inside Out, The Dark Knight, 12 Years a Slave',
 '27': '5. Daniel Radcliffe & Rupert Grint'}

In [297]:
# ...and make sure that nothing has been skipped.
len(answers)

27