In [282]:
import pandas as pd
import numpy as np

In [283]:
data = pd.read_csv('movie_metadata.csv')

In [284]:
print(data.columns)
set([data[col].dtype for col in data.columns])

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')


{dtype('int64'), dtype('float64'), dtype('O')}

Fill every NaN value with the mean of the column.

In [285]:
for col in data.select_dtypes(['int', 'float']).columns:
    mean = data[col].mean()
    data[col].fillna(mean, inplace=True)

Remove duplicate rows

In [286]:
data = data.drop_duplicates()

Add new column `profit`

In [287]:
data['profit'] = data['gross'] - data['budget']

Remove spaces from and lower the titles

In [288]:
data['movie_title'] = data['movie_title'].apply(lambda x: x.replace(' ', '').lower())
data['movie_title']

0                                    avatar 
1        piratesofthecaribbean:atworld'send 
2                                   spectre 
3                        thedarkknightrises 
4       starwars:episodevii-theforceawakens 
                        ...                 
5038                  signedsealeddelivered 
5039                           thefollowing 
5040                      aplaguesopleasant 
5041                        shanghaicalling 
5042                         mydatewithdrew 
Name: movie_title, Length: 4998, dtype: object

Sort by `imdb_score` in descending order 

In [289]:
data.sort_values(by='imdb_score', ascending=False)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,profit
2765,Color,John Blanchard,140.194272,65.0,0.000000,176.0,Andrea Martin,770.0,4.846841e+07,Comedy,...,English,Canada,,3.975262e+07,2002.470517,179.0,9.5,1.330000,0,8.715787e+06
1937,Color,Frank Darabont,199.000000,142.0,0.000000,461.0,Jeffrey DeMunn,11000.0,2.834147e+07,Crime|Drama,...,English,USA,R,2.500000e+07,1994.000000,745.0,9.3,1.850000,108000,3.341469e+06
3466,Color,Francis Ford Coppola,208.000000,175.0,0.000000,3000.0,Marlon Brando,14000.0,1.348220e+08,Crime|Drama,...,English,USA,R,6.000000e+06,1972.000000,10000.0,9.2,1.850000,43000,1.288220e+08
3207,Color,,53.000000,55.0,686.509212,2.0,Olaf Lubaszenko,20.0,4.470930e+05,Drama,...,Polish,Poland,TV-MA,3.975262e+07,2002.470517,3.0,9.1,1.330000,0,-3.930553e+07
2824,Color,,53.000000,55.0,686.509212,2.0,Olaf Lubaszenko,20.0,4.470930e+05,Drama,...,Polish,Poland,TV-MA,3.975262e+07,2002.470517,3.0,9.1,1.330000,0,-3.930553e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4605,Color,A. Raven Cruz,3.000000,97.0,0.000000,94.0,Vanilla Ice,639.0,4.846841e+07,Action|Adventure|Comedy|Fantasy|Sci-Fi,...,English,USA,R,1.000000e+06,2005.000000,361.0,1.9,1.780000,128,4.746841e+07
2295,Color,Bob Clark,32.000000,88.0,84.000000,177.0,Vanessa Angel,650.0,9.109322e+06,Comedy|Family|Sci-Fi,...,English,Germany,PG,2.000000e+07,2004.000000,384.0,1.9,2.350000,0,-1.089068e+07
2268,Color,Jason Friedberg,111.000000,88.0,82.000000,329.0,Tony Cox,869.0,1.417465e+07,Comedy,...,English,USA,PG-13,2.500000e+07,2008.000000,624.0,1.9,1.850000,0,-1.082535e+07
1136,Color,Lawrence Kasanoff,12.000000,91.0,11.000000,500.0,Larry Miller,719.0,4.846841e+07,Action|Animation|Comedy|Family|Fantasy,...,English,USA,PG,6.500000e+07,2012.000000,611.0,1.7,2.220403,0,-1.653159e+07


Substitute null language with the most spoken one

In [290]:
most_lang = data.value_counts('language', ascending=False).idxmax(0)
data['language'] = data['language'].fillna(most_lang)

7、创建一个新列 successful_movie，如果 imdb_score 大于等于 7.5 且 profit 大于 0，则为 True，否则为 False。

In [291]:
data['successful_movie'] = (data.imdb_score > 7.5) & (data.profit > 0)

8、将电影数据集中的 genres 列拆分为多个列，每个列代表一个类型，使用 One-Hot 编码。

In [292]:
oh = pd.get_dummies(data['genres'].str.split('|', expand=True).stack(dropna=True))
oh = oh.groupby(level=0,axis=0).max()
oh

Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,...,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
0,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5038,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5039,0,0,0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,1,0,0
5040,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
5041,0,0,0,0,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0


In [293]:
data = data.merge(oh, left_index=True, right_index=True)
data

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
0,Color,James Cameron,723.000000,178.000000,0.000000,855.000000,Joel David Moore,1000.0,7.605058e+08,Action|Adventure|Fantasy|Sci-Fi,...,0,0,0,0,1,0,0,0,0,0
1,Color,Gore Verbinski,302.000000,169.000000,563.000000,1000.000000,Orlando Bloom,40000.0,3.094042e+08,Action|Adventure|Fantasy,...,0,0,0,0,0,0,0,0,0,0
2,Color,Sam Mendes,602.000000,148.000000,0.000000,161.000000,Rory Kinnear,11000.0,2.000742e+08,Action|Adventure|Thriller,...,0,0,0,0,0,0,0,1,0,0
3,Color,Christopher Nolan,813.000000,164.000000,22000.000000,23000.000000,Christian Bale,27000.0,4.481306e+08,Action|Thriller,...,0,0,0,0,0,0,0,1,0,0
4,,Doug Walker,140.194272,107.201074,131.000000,645.009761,Rob Walker,131.0,4.846841e+07,Documentary,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5038,Color,Scott Smith,1.000000,87.000000,2.000000,318.000000,Daphne Zuniga,637.0,4.846841e+07,Comedy|Drama,...,0,0,0,0,0,0,0,0,0,0
5039,Color,,43.000000,43.000000,686.509212,319.000000,Valorie Curry,841.0,4.846841e+07,Crime|Drama|Mystery|Thriller,...,1,0,0,0,0,0,0,1,0,0
5040,Color,Benjamin Roberds,13.000000,76.000000,0.000000,0.000000,Maxwell Moody,0.0,4.846841e+07,Drama|Horror|Thriller,...,0,0,0,0,0,0,0,1,0,0
5041,Color,Daniel Hsia,14.000000,100.000000,0.000000,489.000000,Daniel Henney,946.0,1.044300e+04,Comedy|Drama|Romance,...,0,0,0,1,0,0,0,0,0,0


9、计算每个导演的平均评分，将结果添加为新列 avg_director_score。

In [294]:
direc_score = data.groupby('director_name')['imdb_score'].mean().reset_index().rename(columns={'imdb_score': 'avg_director_score'})
direc_score

Unnamed: 0,director_name,avg_director_score
0,A. Raven Cruz,1.9
1,Aaron Hann,6.0
2,Aaron Schneider,7.1
3,Aaron Seltzer,2.7
4,Abel Ferrara,6.6
...,...,...
2393,Zoran Lisinac,7.1
2394,Álex de la Iglesia,6.1
2395,Émile Gaudreault,6.7
2396,Éric Tessier,6.6


In [295]:
data = data.join(direc_score.set_index('director_name'), on='director_name')
data

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Thriller,War,Western,avg_director_score
0,Color,James Cameron,723.000000,178.000000,0.000000,855.000000,Joel David Moore,1000.0,7.605058e+08,Action|Adventure|Fantasy|Sci-Fi,...,0,0,0,1,0,0,0,0,0,7.914286
1,Color,Gore Verbinski,302.000000,169.000000,563.000000,1000.000000,Orlando Bloom,40000.0,3.094042e+08,Action|Adventure|Fantasy,...,0,0,0,0,0,0,0,0,0,6.985714
2,Color,Sam Mendes,602.000000,148.000000,0.000000,161.000000,Rory Kinnear,11000.0,2.000742e+08,Action|Adventure|Thriller,...,0,0,0,0,0,0,1,0,0,7.500000
3,Color,Christopher Nolan,813.000000,164.000000,22000.000000,23000.000000,Christian Bale,27000.0,4.481306e+08,Action|Thriller,...,0,0,0,0,0,0,1,0,0,8.425000
4,,Doug Walker,140.194272,107.201074,131.000000,645.009761,Rob Walker,131.0,4.846841e+07,Documentary,...,0,0,0,0,0,0,0,0,0,7.100000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5038,Color,Scott Smith,1.000000,87.000000,2.000000,318.000000,Daphne Zuniga,637.0,4.846841e+07,Comedy|Drama,...,0,0,0,0,0,0,0,0,0,7.700000
5039,Color,,43.000000,43.000000,686.509212,319.000000,Valorie Curry,841.0,4.846841e+07,Crime|Drama|Mystery|Thriller,...,0,0,0,0,0,0,1,0,0,
5040,Color,Benjamin Roberds,13.000000,76.000000,0.000000,0.000000,Maxwell Moody,0.0,4.846841e+07,Drama|Horror|Thriller,...,0,0,0,0,0,0,1,0,0,6.300000
5041,Color,Daniel Hsia,14.000000,100.000000,0.000000,489.000000,Daniel Henney,946.0,1.044300e+04,Comedy|Drama|Romance,...,0,0,1,0,0,0,0,0,0,6.300000


对电影数据集中的 title_year 列进行分桶，创建一个新列 year_group，表示年份所属的年代

In [296]:
years = pd.cut(data['title_year'], bins=[1910 + k * 10 for k in range(13)], labels=[1910 + k * 10 for k in range(12)]).reset_index()
years = years.rename(columns={'title_year': 'year_group'})['year_group']
years

0       2000
1       2000
2       2010
3       2010
4       2000
        ... 
4993    2010
4994    2000
4995    2010
4996    2010
4997    2000
Name: year_group, Length: 4998, dtype: category
Categories (12, int64): [1910 < 1920 < 1930 < 1940 ... 1990 < 2000 < 2010 < 2020]

In [297]:
data = data.merge(years, left_index=True, right_index=True)
data

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,Reality-TV,Romance,Sci-Fi,Short,Sport,Thriller,War,Western,avg_director_score,year_group
0,Color,James Cameron,723.000000,178.000000,0.0,855.000000,Joel David Moore,1000.0,7.605058e+08,Action|Adventure|Fantasy|Sci-Fi,...,0,0,1,0,0,0,0,0,7.914286,2000
1,Color,Gore Verbinski,302.000000,169.000000,563.0,1000.000000,Orlando Bloom,40000.0,3.094042e+08,Action|Adventure|Fantasy,...,0,0,0,0,0,0,0,0,6.985714,2000
2,Color,Sam Mendes,602.000000,148.000000,0.0,161.000000,Rory Kinnear,11000.0,2.000742e+08,Action|Adventure|Thriller,...,0,0,0,0,0,1,0,0,7.500000,2010
3,Color,Christopher Nolan,813.000000,164.000000,22000.0,23000.000000,Christian Bale,27000.0,4.481306e+08,Action|Thriller,...,0,0,0,0,0,1,0,0,8.425000,2010
4,,Doug Walker,140.194272,107.201074,131.0,645.009761,Rob Walker,131.0,4.846841e+07,Documentary,...,0,0,0,0,0,0,0,0,7.100000,2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4993,Color,William Eubank,161.000000,97.000000,18.0,236.000000,Olivia Cooke,852.0,4.846841e+07,Sci-Fi|Thriller,...,0,0,1,0,0,1,0,0,6.100000,2010
4994,Color,Patrick Meaney,7.000000,81.000000,3.0,18.000000,Greg Aronowitz,26.0,4.846841e+07,Biography|Documentary,...,0,0,0,0,0,0,0,0,7.400000,2000
4995,Color,Chad Hartigan,34.000000,83.000000,3.0,69.000000,Paul Eenhoorn,695.0,4.846841e+07,Drama,...,0,0,0,0,0,0,0,0,6.600000,2010
4996,Color,Malcolm Goodwin,140.194272,96.000000,117.0,281.000000,Jon Gries,948.0,4.846841e+07,Comedy,...,0,0,0,0,0,0,0,0,5.500000,2010


In [298]:
data['title_year'].unique()

array([2009.        , 2007.        , 2015.        , 2012.        ,
       2002.47051672, 2010.        , 2016.        , 2006.        ,
       2008.        , 2013.        , 2011.        , 2014.        ,
       2005.        , 1997.        , 2004.        , 1999.        ,
       1995.        , 2003.        , 2001.        , 2002.        ,
       1998.        , 2000.        , 1990.        , 1991.        ,
       1994.        , 1996.        , 1982.        , 1993.        ,
       1979.        , 1992.        , 1989.        , 1984.        ,
       1988.        , 1978.        , 1962.        , 1980.        ,
       1972.        , 1981.        , 1968.        , 1985.        ,
       1940.        , 1963.        , 1987.        , 1986.        ,
       1973.        , 1983.        , 1976.        , 1977.        ,
       1970.        , 1971.        , 1969.        , 1960.        ,
       1965.        , 1964.        , 1927.        , 1974.        ,
       1937.        , 1975.        , 1967.        , 1951.     

# Part 2

1. 使用 NumPy 计算电影数据集中 duration 列的均值，忽略缺失值。
2. 找出电影数据集中 budget 列的异常值，使用 3 倍标准差法。
3. 使用 NumPy 计算每个电影的 profit，并添加到数据集中。
4. 使用 NumPy 将电影标题（movie_title）的字符全部转换为大写。
5. 找出电影数据集中 imdb_score 的 25% 分位数和 75% 分位数，然后将小于 25% 分位数和大于 75% 分位数的值替换为中位数。
6. 使用 NumPy 计算电影数据集中 num_voted_users 列的总和。
7. 找出电影数据集中 language 列中最常见的语言。
8. 使用 NumPy 计算电影数据集中 actor_1_facebook_likes 列的中位数，忽略缺失值。
9. 使用 NumPy 计算电影数据集中 num_user_for_reviews 列的平均值，忽略缺失值。
10. 使用 NumPy 计算电影数据集中 movie_facebook_likes 列的标准差，忽略缺失值

In [301]:
data = pd.read_csv('movie_metadata.csv')

In [312]:
# Q1
np.mean(data['duration'])
np.mean(data['duration'][data['duration'].notna()])

107.2010739856802

In [321]:
# Q2
budgets = data['budget'][data['budget'].notna()]
mean, std = np.mean(budgets), np.std(budgets)
budgets[(budgets > mean + 3 * std) | (budgets < mean - 3 * std)]

2323    2.400000e+09
2334    2.127520e+09
2988    1.221550e+10
3005    2.500000e+09
3075    7.000000e+08
3423    1.100000e+09
3851    7.000000e+08
3859    4.200000e+09
4542    1.000000e+09
Name: budget, dtype: float64

In [334]:
# Q3
idx = data['budget'].notna() & data['gross'].notna()
profit = np.array(data[idx]['budget'].array)
data.loc[idx, 'profit'] = profit
data

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,profit
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,237000000.0
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,300000000.0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,245000000.0
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,250000000.0
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,12.0,7.1,,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5038,Color,Scott Smith,1.0,87.0,2.0,318.0,Daphne Zuniga,637.0,,Comedy|Drama,...,English,Canada,,,2013.0,470.0,7.7,,84,
5039,Color,,43.0,43.0,,319.0,Valorie Curry,841.0,,Crime|Drama|Mystery|Thriller,...,English,USA,TV-14,,,593.0,7.5,16.00,32000,
5040,Color,Benjamin Roberds,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,,Drama|Horror|Thriller,...,English,USA,,1400.0,2013.0,0.0,6.3,,16,
5041,Color,Daniel Hsia,14.0,100.0,0.0,489.0,Daniel Henney,946.0,10443.0,Comedy|Drama|Romance,...,English,USA,PG-13,,2012.0,719.0,6.3,2.35,660,


In [343]:
# Q4
data['movie_title'] = np.vectorize(lambda x: x.upper())(np.array(data['movie_title']))
data

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,profit
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,237000000.0
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,300000000.0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,245000000.0
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,250000000.0
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,12.0,7.1,,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5038,Color,Scott Smith,1.0,87.0,2.0,318.0,Daphne Zuniga,637.0,,Comedy|Drama,...,English,Canada,,,2013.0,470.0,7.7,,84,
5039,Color,,43.0,43.0,,319.0,Valorie Curry,841.0,,Crime|Drama|Mystery|Thriller,...,English,USA,TV-14,,,593.0,7.5,16.00,32000,
5040,Color,Benjamin Roberds,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,,Drama|Horror|Thriller,...,English,USA,,1400.0,2013.0,0.0,6.3,,16,
5041,Color,Daniel Hsia,14.0,100.0,0.0,489.0,Daniel Henney,946.0,10443.0,Comedy|Drama|Romance,...,English,USA,PG-13,,2012.0,719.0,6.3,2.35,660,


In [360]:
# Q5
idx = data['imdb_score'].notna()
scores = np.array(data['imdb_score'][idx])
scores2 = np.vectorize(lambda x: np.median(scores) if x > np.quantile(scores, q=0.75) or x < np.quantile(scores, q=0.25) else x)(scores)
data.loc[idx, 'imdb_score'] = scores2
scores2

array([6.6, 7.1, 6.8, ..., 6.3, 6.3, 6.6])

In [364]:
# Q6
np.sum(data['num_voted_users'])

421938535

In [378]:
# Q7
counts = {}
for l in np.array(data['language']):
    if l not in counts:
        counts[l] = 0
    counts[l] += 1
sorted(list(counts.keys()), key=lambda x: counts[x], reverse=True)[0]

'English'

In [380]:
# Q8
data['actor_1_facebook_likes'].median()

988.0

In [381]:
# Q9
data['num_user_for_reviews'].mean()

272.77080844285143

In [382]:
# Q10
data['movie_facebook_likes'].std()

19320.445109946588

1. 计算电影数据集中 imdb_score 列的平均值、中位数、标准差和范围。
2. 找出电影数据集中评分最高的电影的标题和导演。
3. 计算每个影片类型的数量，并将结果按降序排列。
4. 计算电影数据集中每个年代（year_group）的平均 imdb_score。
5. 找出电影数据集中投票人数最多的电影的标题和导演。
6. 算电影数据集中每个语言（language）的平均 gross。
7. 找出电影数据集中主演1（actor_1_name）Facebook 点赞数最高的电影的标题和导演。
8. 计算电影数据集中每个国家（country）的平均 imdb_score。
9. 找出电影数据集中投票人数与收入（gross）之间的相关系数。
10. 计算电影数据集中不同分级（content_rating）的电影数量，并将结果按升序排列

In [383]:
data = pd.read_csv('movie_metadata.csv')

In [384]:
# Q1
data['imdb_score'].describe()

count    5043.000000
mean        6.442138
std         1.125116
min         1.600000
25%         5.800000
50%         6.600000
75%         7.200000
max         9.500000
Name: imdb_score, dtype: float64

In [398]:
# Q2
data.sort_values('imdb_score', ascending=False).head(1)[['movie_title', 'director_name']]

Unnamed: 0,movie_title,director_name
2765,Towering Inferno,John Blanchard


In [414]:
# Q3
pd.get_dummies(data['genres'].str.split('|', expand=True).stack()).groupby(level=0).sum().sum(axis=0).sort_values(ascending=False)

Drama          2594
Comedy         1872
Thriller       1411
Action         1153
Romance        1107
Adventure       923
Crime           889
Sci-Fi          616
Fantasy         610
Horror          565
Family          546
Mystery         500
Biography       293
Animation       242
Music           214
War             213
History         207
Sport           182
Musical         132
Documentary     121
Western          97
Film-Noir         6
Short             5
News              3
Reality-TV        2
Game-Show         1
dtype: int64

In [422]:
# Q4
years = pd.cut(data['title_year'], bins=[1910 + k * 10 for k in range(13)], labels=[1910 + k * 10 for k in range(12)]).reset_index()
years = years.rename(columns={'title_year': 'year_group'})['year_group']
data.merge(years, left_index=True, right_index=True)[['year_group', 'imdb_score']].groupby('year_group').mean('imdb_score')[:-1] # 2020 NaN

Unnamed: 0_level_0,imdb_score
year_group,Unnamed: 1_level_1
1910,6.4
1920,7.74
1930,7.610526
1940,7.428571
1950,7.55
1960,7.336585
1970,7.102381
1980,6.63569
1990,6.460949
2000,6.355118


In [426]:
# Q5
data.sort_values(by='num_voted_users', ascending=False).head(1)[['movie_title', 'director_name']]

Unnamed: 0,movie_title,director_name
1937,The Shawshank Redemption,Frank Darabont


In [428]:
# Q6
data[['language', 'gross']].groupby('language').mean('gross')

Unnamed: 0_level_0,gross
language,Unnamed: 1_level_1
Aboriginal,39340390.0
Arabic,840915.5
Aramaic,499263.0
Bosnian,301305.0
Cantonese,6429425.0
Chinese,50000.0
Czech,617228.0
Danish,801285.7
Dari,8462619.0
Dutch,1884888.0


In [430]:
# Q7
data.sort_values('actor_1_facebook_likes', ascending=False).head(1)[['movie_title', 'director_name']]

Unnamed: 0,movie_title,director_name
1902,Anchorman: The Legend of Ron Burgundy,Adam McKay


In [429]:
data.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [433]:
# Q8
data[['country', 'imdb_score']].sort_values(by='country', ascending=False).groupby('country').mean('imdb_score')

Unnamed: 0_level_0,imdb_score
country,Unnamed: 1_level_1
Afghanistan,7.400000
Argentina,7.500000
Aruba,4.800000
Australia,6.514545
Bahamas,4.400000
...,...
Turkey,6.000000
UK,6.818304
USA,6.367428
United Arab Emirates,8.200000


In [434]:
# Q9
subdata = data[data['num_voted_users'].notna() & data['gross'].notna()]
np.corrcoef(np.array(data['num_voted_users'], data['gross']))

1.0

In [453]:
# 10
data['content_rating'].value_counts().sort_values(ascending=True)

TV-Y7           1
TV-Y            1
M               5
GP              6
NC-17           7
Passed          9
TV-G           10
X              13
TV-PG          13
TV-MA          20
TV-14          30
Approved       55
Unrated        62
G             112
Not Rated     116
PG            701
PG-13        1461
R            2118
Name: content_rating, dtype: int64