In [3]:
import pandas as pd
import numpy as np
import json

def mainchar1(castlist):
    """Return the first word of the first cast entry's character name.

    Parameters
    ----------
    castlist : str
        JSON-encoded list of cast entries; each entry is indexed with the
        ``'character'`` key.

    Returns
    -------
    str or float
        The first whitespace-separated token of the first entry's
        ``'character'`` value, or ``np.nan`` when the input cannot be parsed
        or holds no such value.
    """
    try:
        return json.loads(castlist)[0]['character'].split()[0]
    except (TypeError, ValueError, LookupError, AttributeError):
        # TypeError      - castlist is not a string (e.g. a NaN cell)
        # ValueError     - malformed JSON (json.JSONDecodeError subclasses it)
        # LookupError    - empty list, missing 'character' key, blank name
        # AttributeError - 'character' is null or otherwise not a string
        # Anything else (notably KeyboardInterrupt) is allowed to propagate.
        return np.nan
    
def mainchar2(castlist):
    """Return the first word of the second cast entry's character name.

    Parameters
    ----------
    castlist : str
        JSON-encoded list of cast entries; each entry is indexed with the
        ``'character'`` key.

    Returns
    -------
    str or float
        The first whitespace-separated token of the second entry's
        ``'character'`` value, or ``np.nan`` when the input cannot be parsed
        or has fewer than two usable entries.
    """
    try:
        return json.loads(castlist)[1]['character'].split()[0]
    except (TypeError, ValueError, LookupError, AttributeError):
        # TypeError      - castlist is not a string (e.g. a NaN cell)
        # ValueError     - malformed JSON (json.JSONDecodeError subclasses it)
        # LookupError    - fewer than two entries, missing key, blank name
        # AttributeError - 'character' is null or otherwise not a string
        # Anything else (notably KeyboardInterrupt) is allowed to propagate.
        return np.nan

movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

# "title" is dropped from the credits side before merging so the merged frame
# keeps a single "title" column (the one from `movies`).
# .copy() makes `m` an independent frame, so the column assignments below do
# not write into a slice of the merge result.
m = movies.merge(
    credits.drop("title", axis=1), left_on="id", right_on="movie_id"
)[["title", "release_date", "cast"]].copy()

# First word of the first / second cast entry's character name (NaN if absent).
m["char1"] = m["cast"].apply(mainchar1)
m["char2"] = m["cast"].apply(mainchar2)

# Replace the release date by its year; the .dt accessor is vectorised and
# yields NaN for dates that could not be parsed.
m["release_date"] = pd.to_datetime(m["release_date"]).dt.year
m.head()

Unnamed: 0,title,release_date,cast,char1,char2
0,Avatar,2009.0,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...",Jake,Neytiri
1,Pirates of the Caribbean: At World's End,2007.0,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...",Captain,Will
2,Spectre,2015.0,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...",James,Blofeld
3,The Dark Knight Rises,2012.0,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...",Bruce,Alfred
4,John Carter,2012.0,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...",John,Dejah


In [48]:
import os
import matplotlib.pyplot as plt
%matplotlib inline

def add_prop(group):
    """Add a ``prop`` column to ``group`` in place and return ``group``.

    ``prop`` is each row's ``births`` expressed as a percentage of the sum of
    ``births`` over the whole of ``group``, rounded to 4 decimals.
    """
    total_births = group.births.sum()
    percentage = 100.0 * group.births / total_births
    group['prop'] = round(percentage, 4)
    return group

class BabyNames(object):
    """Query helper over a directory of per-state baby-name count files.

    Every file in ``fileloc`` whose name does not contain ``'pdf'`` and is not
    ``.DS_Store`` is read as a header-less CSV with the columns
    ``state, sex, year, name, births``. The first two characters of each file
    name are used as the state code.

    Attributes set by ``__init__``:
        fileloc       directory the files were read from
        data          all files concatenated into one DataFrame
        states_list   state codes derived from the file names
        years_list    distinct values of ``data.year``
        grouped_data  births summed per (name, year) with a ``prop`` column
                      (percentage of that year's births, 4 decimals)
    """

    def __init__(self, fileloc):
        """Load every data file found in ``fileloc`` and pre-compute shares."""
        self.fileloc = fileloc

        # Sorted so that row order and states_list do not depend on the
        # arbitrary order in which os.listdir returns entries.
        files = sorted(x for x in os.listdir(self.fileloc)
                       if 'pdf' not in x and x != '.DS_Store')
        columns = ['state', 'sex', 'year', 'name', 'births']
        frames = [pd.read_csv(os.path.join(self.fileloc, f), names=columns)
                  for f in files]
        self.data = pd.concat(frames, ignore_index=True)

        self.states_list = [f[:2] for f in files]
        self.years_list = self.data.year.unique()
        self.grouped_data = self._name_year_props()

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _add_prop(frame):
        """Return ``frame`` plus ``prop``: births as a % of its year's births."""
        # transform('sum') broadcasts each year's total back onto its rows, so
        # no groupby.apply (and no in-place mutation of groups) is needed.
        yearly_total = frame.groupby("year")["births"].transform("sum")
        return frame.assign(prop=(100.0 * frame["births"] / yearly_total).round(4))

    def _name_year_props(self, state='all', sex='both'):
        """Sum births per (name, year), optionally for one state/sex, and add ``prop``."""
        df = self.data
        if state != "all":
            df = df[df.state == state]
        if sex != "both":
            df = df[df.sex == sex]
        # Only "births" is summed; the string columns must not be aggregated.
        totals = df.groupby(["name", "year"], as_index=False)["births"].sum()
        return self._add_prop(totals)

    def _props_for(self, state='all', sex='both'):
        """Like ``_name_year_props`` but reuses ``grouped_data`` when unfiltered."""
        if state == "all" and sex == "both":
            return self.grouped_data
        return self._name_year_props(state, sex)

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------
    def get(self):
        """Return the full concatenated DataFrame."""
        return self.data

    def Count(self, state='', year=''):
        """Return total births, optionally restricted to one state and/or year.

        An empty string for ``state`` / ``year`` means "all".
        """
        states = self.states_list if state == '' else [state]
        years = self.years_list if year == '' else [year]
        selected = self.data["state"].isin(states) & self.data["year"].isin(years)
        return self.data[selected]["births"].sum()

    @staticmethod
    def get_top10(group):
        """Return the 10 rows of ``group`` with the largest ``births``.

        Declared static because it takes no ``self``; it stays callable as
        ``BabyNames.get_top10(group)``.
        """
        return group.sort_values(by='births', ascending=False).head(10)

    def Top10BabyNames(self, state='', year=''):
        """Return the ten most frequent male and female names.

        An empty string for ``state`` / ``year`` means "all". The result has
        the columns ``Rank``, ``Male`` and ``Female``; a column is padded with
        NaN when fewer than ten names exist for that sex.
        """
        states = self.states_list if state == '' else [state]
        years = self.years_list if year == '' else [year]
        subset = self.data[self.data["state"].isin(states) &
                           self.data["year"].isin(years)]
        totals = subset.groupby(["name", "sex"], as_index=False)["births"].sum()

        # Sort once, then keep the first ten rows of each sex.
        top = totals.sort_values(by="births", ascending=False).groupby("sex").head(10)
        # Rank is derived from the data, so short groups no longer break it.
        top = top.assign(Rank=top.groupby("sex").cumcount() + 1)

        ranked = top.pivot(index="Rank", columns="sex", values="name")
        ranked = ranked.reindex(columns=["M", "F"])  # tolerate a missing sex
        result = ranked.rename(columns={"M": "Male", "F": "Female"}).reset_index()
        result.columns.name = None
        return result[["Rank", "Male", "Female"]]

    def ChangeOfPopularity(self, fromYear=2014, toYear=2015, top=10):
        """List names whose share of births rose, fell or stayed equal.

        Shares are percentages of each year's births. Names are split by the
        sign of ``share(toYear) - share(fromYear)``; within each split they are
        ordered by their ``toYear`` share (not by the size of the change) and
        the first ``top`` are kept. The result is indexed by ``Rank`` with the
        columns ``More popular``, ``Less popular`` and ``Same popularity``;
        shorter or absent splits are padded with NaN.
        """
        in_years = (self.data["year"] == fromYear) | (self.data["year"] == toYear)
        totals = self.data[in_years].groupby(["name", "year"],
                                             as_index=False)["births"].sum()
        shares = self._add_prop(totals)
        wide = shares.pivot(index="name", columns="year",
                            values="prop").fillna(0).reset_index()

        wide["delta"] = wide[toYear] - wide[fromYear]
        wide["sign"] = "neg"
        wide.loc[wide.delta > 0, "sign"] = "pos"
        wide.loc[wide.delta == 0, "sign"] = "zero"

        best = wide.sort_values(by=toYear, ascending=False).groupby("sign").head(top)
        best = best.assign(Rank=best.groupby("sign").cumcount() + 1)

        ranked = best.pivot(index="Rank", columns="sign", values="name")
        ranked = ranked.reindex(columns=["pos", "neg", "zero"])
        result = ranked.rename(columns={"pos": "More popular",
                                        "neg": "Less popular",
                                        "zero": "Same popularity"})
        result.columns.name = None
        return result

    def Top5NamesPerYear(self, year=2015, sex=''):
        """Return, per state, the five data rows with the most births.

        An empty string for ``year`` / ``sex`` means "all". Rows are ranked as
        they appear in the data (no aggregation), so with several years or
        both sexes selected a name can appear more than once.

        State codes are translated through the module-level ``us_states``
        mapping, which must be defined before this method is called; codes
        missing from the mapping are shown unchanged. States with fewer than
        five matching rows are padded with NaN.
        """
        sexes = ["M", "F"] if sex == '' else [sex]
        years = self.years_list if year == '' else [year]

        cols = ["State"]
        for rank in range(1, 6):
            cols.extend(["Rank {}".format(rank), "Num"])

        in_scope = self.data[self.data["year"].isin(years) &
                             self.data["sex"].isin(sexes)]
        rows = []
        for code in self.states_list:
            top = in_scope[in_scope["state"] == code].sort_values(
                by="births", ascending=False).head(5)
            row = [us_states.get(code, code)]
            for name, births in zip(top["name"], top["births"]):
                row.extend([name, births])
            row.extend([float("nan")] * (len(cols) - len(row)))
            rows.append(row)
        # One row per loaded state instead of a hard-coded 51.
        return pd.DataFrame(rows, columns=cols, dtype=object)

    def NamePopularityPlot(self, name='Jim', yearRange=(2000,2015), state='all', sex='both'):
        """Plot the yearly share (%) of ``name`` between two years, inclusive.

        ``state='all'`` / ``sex='both'`` disable the respective filter.
        """
        df = self._props_for(state, sex)
        first, last = yearRange[0], yearRange[1]
        df = df[(df["year"] >= first) & (df["year"] <= last) & (df["name"] == name)]
        t = "Popularity of name {} from {} to {}".format(name, first, last)
        df.set_index("year")["prop"].plot(title=t, figsize=(12, 6))

    def NameJump(self, name="Bill", year=2000, state='all', sex='both'):
        """Return share(name, year + 1) - share(name, year) in percentage points.

        ``state='all'`` / ``sex='both'`` disable the respective filter.

        Raises
        ------
        IndexError
            If ``name`` has no entry in ``year`` or in ``year + 1``.
        """
        df = self._props_for(state, sex)
        df = df[df["name"] == name]
        # Look the two years up explicitly rather than relying on row order.
        before = df.loc[df["year"] == year, "prop"]
        after = df.loc[df["year"] == year + 1, "prop"]
        if before.empty or after.empty:
            raise IndexError(
                "no data for name {!r} in both {} and the following year".format(name, year))
        return after.iloc[0] - before.iloc[0]

    def NameFlip(self, n=10):
        """Plot female-minus-male births per year for names that flipped sex.

        A name "flipped" when it has at least one year with more female than
        male births and at least one year with the opposite. The ``n`` flipped
        names with the most births overall are plotted.
        """
        dfy = self.data.pivot_table(values="births", index=["name", "year"],
                                    columns="sex", aggfunc="sum").fillna(0).reset_index()
        dfy["delta"] = dfy["F"] - dfy["M"]
        dfy["tot"] = dfy["F"] + dfy["M"]

        by_name = dfy.groupby("name")
        flipped = (by_name["delta"].max() > 0) & (by_name["delta"].min() < 0)
        totals = by_name["tot"].sum()
        top_names = totals[flipped].sort_values(ascending=False).head(n).index

        result = dfy[dfy["name"].isin(top_names)].pivot(
            index="year", columns="name", values="delta").fillna(0)
        t = "Top {} names that flipped over the years".format(n)
        result.plot(title=t, figsize=(12, 6))

In [49]:
# Load every state file found in the "namesbystate" directory.
s = BabyNames(fileloc="namesbystate")

In [127]:
# Two-letter postal code -> full name, for the 50 states plus DC.
us_states = {
    "AK": "Alaska",
    "AL": "Alabama",
    "AR": "Arkansas",
    "AZ": "Arizona",
    "CA": "California",
    "CO": "Colorado",
    "CT": "Connecticut",
    "DC": "District of Columbia",
    "DE": "Delaware",
    "FL": "Florida",
    "GA": "Georgia",
    "HI": "Hawaii",
    "IA": "Iowa",
    "ID": "Idaho",
    "IL": "Illinois",
    "IN": "Indiana",
    "KS": "Kansas",
    "KY": "Kentucky",
    "LA": "Louisiana",
    "MA": "Massachusetts",
    "MD": "Maryland",
    "ME": "Maine",
    "MI": "Michigan",
    "MN": "Minnesota",
    "MO": "Missouri",
    "MS": "Mississippi",
    "MT": "Montana",
    "NC": "North Carolina",
    "ND": "North Dakota",
    "NE": "Nebraska",
    "NH": "New Hampshire",
    "NJ": "New Jersey",
    "NM": "New Mexico",
    "NV": "Nevada",
    "NY": "New York",
    "OH": "Ohio",
    "OK": "Oklahoma",
    "OR": "Oregon",
    "PA": "Pennsylvania",
    "RI": "Rhode Island",
    "SC": "South Carolina",
    "SD": "South Dakota",
    "TN": "Tennessee",
    "TX": "Texas",
    "UT": "Utah",
    "VA": "Virginia",
    "VT": "Vermont",
    "WA": "Washington",
    "WI": "Wisconsin",
    "WV": "West Virginia",
    "WY": "Wyoming",
}

In [11]:
def MovieJump(name, year, babynames):
    """Best-effort wrapper around ``babynames.NameJump``.

    Parameters
    ----------
    name : str
        Name to look up.
    year : number
        Year passed to ``NameJump``.
    babynames : object
        Anything exposing ``NameJump(name=..., year=...)``.

    Returns
    -------
    The value returned by ``NameJump``, or ``np.nan`` if the lookup raised.
    """
    try:
        return babynames.NameJump(name=name, year=year)
    except Exception:
        # Missing names/years are expected and map to NaN. Exception (not a
        # bare except) lets KeyboardInterrupt stop a long run over all movies.
        return np.nan

# One NameJump lookup per movie and character. A plain comprehension avoids
# np.vectorize, which (without otypes) calls the function an extra time on the
# first row to infer the output dtype and wraps `s` in an object array.
m["char1_jump"] = [MovieJump(name, year, s)
                   for name, year in zip(m["char1"], m["release_date"])]
m["char2_jump"] = [MovieJump(name, year, s)
                   for name, year in zip(m["char2"], m["release_date"])]
# Preview only; the full frame has one row per movie.
m.head()

Unnamed: 0,title,release_date,cast,char1,char2,char1_jump
0,Avatar,2009.0,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...",Jake,Neytiri,0.0055
1,Pirates of the Caribbean: At World's End,2007.0,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...",Captain,Will,
2,Spectre,2015.0,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...",James,Blofeld,0.0074
3,The Dark Knight Rises,2012.0,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...",Bruce,Alfred,0.0013
4,John Carter,2012.0,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...",John,Dejah,0.0033


In [44]:
# Ten movies whose first character's name gained the most share the following year.
m.sort_values(by="char1_jump", ascending=False).head(10)

Unnamed: 0,title,release_date,cast,char1,char2,char1_jump,char2_jump
4595,The Evil Dead,1981.0,"[{""cast_id"": 16, ""character"": ""Ashley 'Ash' J....",Ashley,Cheryl,0.1821,-0.0088
3908,Torn Curtain,1966.0,"[{""cast_id"": 2, ""character"": ""Michael Armstron...",Michael,Sarah,0.1424,0.0064
1337,Twilight,2008.0,"[{""cast_id"": 4, ""character"": ""Isabella 'Bella'...",Isabella,Edward,0.1322,0.0088
446,Con Air,1997.0,"[{""cast_id"": 1, ""character"": ""Cameron Poe"", ""c...",Cameron,Marshal,0.1186,0.0
2586,Firestarter,1984.0,"[{""cast_id"": 1, ""character"": ""Andrew 'Andy' Mc...",Andrew,Charlene,0.1153,-0.0015
2387,Day of the Dead,1985.0,"[{""cast_id"": 1, ""character"": ""Sarah"", ""credit_...",Sarah,John,0.1064,-0.0354
1045,The Princess Diaries 2: Royal Engagement,2004.0,"[{""cast_id"": 1, ""character"": ""Mia Thermopolis""...",Mia,Queen,0.1036,-0.0004
3873,Class of 1984,1982.0,"[{""cast_id"": 1, ""character"": ""Andrew Norris"", ...",Andrew,Diane,0.0962,0.0014
213,Mission: Impossible II,2000.0,"[{""cast_id"": 31, ""character"": ""Ethan Hunt"", ""c...",Ethan,Sean,0.0906,0.0
1564,Legends of the Fall,1994.0,"[{""cast_id"": 17, ""character"": ""Tristan Ludlow""...",Tristan,Col.,0.0873,


In [45]:
# Ten movies whose second character's name gained the most share the following year.
m.sort_values(by="char2_jump", ascending=False).head(10)

Unnamed: 0,title,release_date,cast,char1,char2,char1_jump,char2_jump
4144,The Lady from Shanghai,1947.0,"[{""cast_id"": 4, ""character"": ""Elsa Bannister"",...",Elsa,Michael,0.0002,0.1295
3439,The Terminator,1984.0,"[{""cast_id"": 26, ""character"": ""The Terminator""...",The,Kyle,,0.1163
511,X-Men,2000.0,"[{""cast_id"": 5, ""character"": ""Charles Xavier /...",Charles,Logan,-0.0043,0.0975
3764,Topaz,1969.0,"[{""cast_id"": 1, ""character"": ""Andr\u00e9 Dever...",André,Nicole,,0.0963
3531,High Plains Drifter,1973.0,"[{""cast_id"": 1, ""character"": ""The Stranger"", ""...",The,Sarah,,0.0911
1135,Lord of War,2005.0,"[{""cast_id"": 1, ""character"": ""Yuri Orlov"", ""cr...",Yuri,Ava,0.0009,0.0881
2628,Blood and Chocolate,2007.0,"[{""cast_id"": 31, ""character"": ""Vivian"", ""credi...",Vivian,Aiden,0.0012,0.0817
371,The Saint,1997.0,"[{""cast_id"": 13, ""character"": ""Simon Templar"",...",Simon,Emma,0.0047,0.0808
2927,Cruel Intentions,1999.0,"[{""cast_id"": 1, ""character"": ""Kathryn Merteuil...",Kathryn,Sebastian,-0.0005,0.079
1346,Reindeer Games,2000.0,"[{""cast_id"": 17, ""character"": ""Rudy Duncan"", ""...",Rudy,Gabriel,0.0008,0.0712
