In [28]:
import pandas as pd
import numpy as np

In [70]:
# Load the drinks data set.
path2data='../data/drinks.csv'
drinks = pd.read_csv(path2data)

# Exploration examples. Only the last expression of a notebook cell is echoed,
# so these lines do not display anything here and do not modify `drinks`;
# move one to the end of a cell to see its result.
drinks.head()
drinks['beer_servings'].mean()
drinks['continent'].value_counts().sort_index()
drinks[drinks['continent'] == 'EU']
drinks[(drinks['continent'] == 'EU') & (drinks['wine_servings'] > 300)]

# Shorten the serving column names (assignment instead of inplace=True so the
# cell reads as an explicit transformation).
drinks = drinks.rename(columns={'beer_servings':'beer',
                                'wine_servings':'wine',
                                'spirit_servings':'spirit'})

# Derived columns: total servings, and pure alcohol expressed as litres * 1000.
drinks['servings'] = drinks['beer'] + drinks['spirit'] + drinks['wine']
drinks['mL'] = drinks['total_litres_of_pure_alcohol'] * 1000

# drop() returns a new frame; the original call discarded it, so the column
# was never removed. Assign the result back to actually drop 'mL'.
drinks = drinks.drop('mL', axis=1)

# Count missing continent codes, then fill them with the text 'NA'.
# NOTE(review): presumably these are North America rows whose code 'NA' was
# parsed as missing by read_csv's default NA handling -- confirm against the file.
drinks['continent'].isnull().sum()
# Assign the filled column back: an inplace fillna on a selected column acts on
# a temporary object and does not update `drinks` under pandas Copy-on-Write.
drinks['continent'] = drinks['continent'].fillna(value='NA')
drinks.groupby(['continent', 'country'])['beer'].mean()

# Map the two-letter continent codes to full names in a new column.
# Codes that are not keys of this dict become NaN in 'cont'.
drinks['cont'] = drinks['continent'].map({'AS':'Asia',
                         'EU':'Europe',
                         'AF':'Africa',
                         'SA':'South America',
                         'NA':'North America'
                        })

# Bin beer servings into a new categorical-style text column.
# Open-ended thresholds replace between(101, 200) / between(201, 400) so that
# values above 400 are labelled 'high' instead of silently staying 'low'.
drinks['beer_level'] = 'low'
drinks.loc[drinks['beer'] > 100, 'beer_level'] = 'med'
drinks.loc[drinks['beer'] > 200, 'beer_level'] = 'high'

# Output to a csv file (the row index is written as the first column).
drinks.to_csv('drinks_updated.csv')

### Four ways to recode a variable.

In [50]:
# Load the IMDB top-1000 data set.
imdb_1000_data_url = '../data/imdb_1000.csv'
movies = pd.read_csv(imdb_1000_data_url)

# what are the content ratings?
movies['content_rating'].value_counts()

# Keep the ratings exactly as loaded. Every recoding example below starts from
# this untouched copy, so each one is independent of the others and the cell
# gives the same result however many examples are run.
raw_rating = movies['content_rating'].copy()
adult_only = (raw_rating == 'X') | (raw_rating == 'TV-MA')

# filter a variable.
movies.loc[adult_only, ['content_rating']]

# recode a variable:
movies.loc[adult_only, 'content_rating'] = "NC-17"

# another way to do this:
# Series.map() yields NaN for any value that is not a key of the dict (for
# example 'NOT RATED' is not listed here), so fall back to the original value
# wherever the mapping produced nothing.
convert_dict = {'X':'NC-17', 'TV-MA':'NC-17', 'R':'R', 'PG-13':'PG-13', 'UNRATED':'UNRATED', 'PG':'PG', 'G':'G'}
movies['content_rating'] = raw_rating.map(convert_dict).fillna(raw_rating)

# yet another way:
# Assign the result back; an inplace replace on a selected column acts on a
# temporary object and does not update `movies` under pandas Copy-on-Write.
movies['content_rating'] = raw_rating.replace(['X', 'TV-MA'], 'NC-17')

# and yet another way:
# A partial dict only names the codes to change; without the fillna fallback
# every other rating would be overwritten with NaN.
movies['content_rating'] = raw_rating.map({'X':'NC-17','TV-MA':'NC-17'}).fillna(raw_rating)

# recoding into a new, binary variable.
# Compare against the ratings as loaded: after the recodes above no row of
# movies['content_rating'] equals 'X' any more, so testing the recoded column
# would label every row 'okay'.
movies['new_rating'] = np.where(raw_rating == 'X', 'NC-17', 'okay')

In [61]:
# Show the column labels of the movies frame.
movies.columns

Index(['star_rating', 'title', 'content_rating', 'genre', 'duration',
       'actors_list'],
      dtype='object')

## How to filter.

In [65]:
# Re-read the file so the filter runs against the ratings as stored on disk.
imdb_1000_data_url = '../data/imdb_1000.csv'
movies = pd.read_csv(imdb_1000_data_url)

# Rows rated 'X' or 'TV-MA', reduced to three columns and ordered by
# ascending star rating.
(
    movies
    .loc[movies['content_rating'].isin(['X', 'TV-MA']),
         ['content_rating', 'title', 'star_rating']]
    .sort_values(by='star_rating')
)

Unnamed: 0,content_rating,title,star_rating
913,X,Suspiria,7.5
486,X,Evil Dead II,7.8
387,X,Midnight Cowboy,8.0
219,TV-MA,Who's Afraid of Virginia Woolf?,8.1
86,X,A Clockwork Orange,8.4


In [46]:
# Pairwise correlation matrix. The frame also holds text columns (title, genre,
# actors_list, ...), which cannot be correlated and make a bare movies.corr()
# raise on recent pandas, so restrict the frame to its numeric columns first.
movies.select_dtypes(include='number').corr()
# Correlation for one specific pair of columns; as the last expression of the
# cell this is the value that gets displayed.
movies['duration'].corr(movies['star_rating'])

Unnamed: 0,star_rating,duration
star_rating,1.0,0.227149
duration,0.227149,1.0


In [47]:
# Rows whose actors_list text mentions Marlon Brando.
# regex=False: the search term is a literal name, not a pattern.
# na=False: a missing actors_list counts as "no match" instead of producing an
# NA entry in the mask, which boolean indexing would reject.
movies.loc[movies['actors_list'].str.contains('Marlon Brando', regex=False, na=False)]

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list,new_rating
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']",okay
51,8.5,Apocalypse Now,R,Drama,153,"[u'Martin Sheen', u'Marlon Brando', u'Robert D...",okay
122,8.3,On the Waterfront,NOT RATED,Crime,108,"[u'Marlon Brando', u'Karl Malden', u'Lee J. Co...",okay
284,8.1,A Streetcar Named Desire,PG,Drama,122,"[u'Vivien Leigh', u'Marlon Brando', u'Kim Hunt...",okay
