In [6]:
import pandas as pd
from scipy.stats import ttest_ind

Playing around with Tableau, I noticed that films with the word "zombie" in the title were rated differently on IMDb than those without it (lower, judging by the negative t statistics below). Here I will perform an independent two-sample t-test to check whether the difference is significant.

In [7]:
# Load the film data set from a CSV file given by a relative path.
df = pd.read_csv('zombie_movies.csv')
# Preview the first rows to check the available columns.
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Title Type,IMDb Rating,Runtime (mins),Year,Genres,Num Votes,Release Date,Directors,...,Crime,Romance,War,Fantasy,Animation,Musical,Family,Western,Title_Dead,Title_Zombie
0,247,Das Cabinet des Dr. Caligari,movie,8.0,76.0,1920,"Horror, Mystery, Thriller",65778,1920-02-26,Robert Wiene,...,False,False,False,False,False,False,False,False,False,False
1,85,White Zombie,movie,6.2,69.0,1932,Horror,10982,1932-07-28,Victor Halperin,...,False,False,False,False,False,False,False,False,False,True
2,48,The Walking Dead,movie,6.6,66.0,1936,"Crime, Drama, Horror, Sci-Fi",2789,1936-03-01,Michael Curtiz,...,True,False,False,False,False,False,False,False,True,False
3,86,Revolt of the Zombies,movie,3.4,65.0,1936,"Adventure, Horror",1779,1936-06-04,Victor Halperin,...,False,False,False,False,False,False,False,False,False,True
4,87,King of the Zombies,movie,5.2,67.0,1941,"Adventure, Comedy, Horror",2340,1941-05-14,Jean Yarbrough,...,False,False,False,False,False,False,False,False,False,True


In [8]:
# Split the films into two groups on the Title_Zombie flag.
Z_true = df.loc[df['Title_Zombie'] == True]
Z_false = df.loc[df['Title_Zombie'] == False]

# Independent two-sample t-test on the ratings, using scipy's default settings.
ttest_ind(
    Z_true['IMDb Rating'],
    Z_false['IMDb Rating'],
)

Ttest_indResult(statistic=-3.9392213820358135, pvalue=0.00010720897118688328)

In [9]:
# Rebuild the two groups from the Title_Zombie flag.
Z_true = df[df['Title_Zombie'].eq(True)]
Z_false = df[df['Title_Zombie'].eq(False)]

# Welch's t-test: equal_var=False drops the assumption that both groups
# share the same variance.
ttest_ind(
    Z_true['IMDb Rating'],
    Z_false['IMDb Rating'],
    equal_var=False,
)

Ttest_indResult(statistic=-3.3787672588879754, pvalue=0.0021652150999688455)

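The results are significant at p < 0.001 in the regular independent-samples t-test, and in Welch's t-test they are still significant at the p < 0.01 level. In both tests the t statistic is negative, i.e. the films with "zombie" in the title have the lower mean rating.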

From here I will proceed to run tests like this on the various genres.

In [33]:
#creating function to streamline the process going forward
def zttest(col_name, data=None, rating_col='IMDb Rating'):
    """Print Student's and Welch's t-tests comparing ratings across a flag.

    The rows of the frame are split into those where the flag equals True and
    those where it equals False. The ``rating_col`` values of the two groups
    are then compared with ``ttest_ind`` twice: once with its default
    settings and once with ``equal_var=False`` (Welch's test).

    Parameters
    ----------
    col_name : str or boolean Series
        Either the name of a flag column (e.g. ``'Action'``) or the flag
        column itself (e.g. ``df['Action']``).
    data : pandas.DataFrame, optional
        Frame to analyse. When omitted, the notebook-level ``df`` is used.
    rating_col : str, optional
        Name of the column whose values are compared between the groups.

    Returns
    -------
    None
        The two results are printed, not returned.
    """
    frame = df if data is None else data

    # A bare column name cannot be compared to True directly: that yields the
    # scalar False and indexing with it raises KeyError. Resolve names to the
    # actual column first; anything else is used as the flag values as-is.
    flags = frame[col_name] if isinstance(col_name, str) else col_name

    # Explicit comparisons (rather than `flags` / `~flags`) so that rows whose
    # flag equals neither True nor False end up in neither group.
    ratings_true = frame[flags == True][rating_col]
    ratings_false = frame[flags == False][rating_col]

    print("Results of independent samples test:")
    print(ttest_ind(ratings_true, ratings_false))
    print("Results of Welch's test:")
    print(ttest_ind(ratings_true, ratings_false, equal_var=False))
    

In [34]:
zttest(df['Action'])

Results of independent samples test:
Ttest_indResult(statistic=-1.4333568583127096, pvalue=0.15305730093698186)
Results of Welch's test:
Ttest_indResult(statistic=-1.3324706293860544, pvalue=0.1862310992625)


The results are not significant for the Action genre.

In [20]:
df.columns

Index(['Unnamed: 0', 'Title', 'Title Type', 'IMDb Rating', 'Runtime (mins)',
       'Year', 'Genres', 'Num Votes', 'Release Date', 'Directors', 'Horror',
       'Sci-Fi', 'Drama', 'Thriller', 'Comedy', 'Adventure', 'Action', 'Adult',
       'Mystery', 'Crime', 'Romance', 'War', 'Fantasy', 'Animation', 'Musical',
       'Family', 'Western', 'Title_Dead', 'Title_Zombie'],
      dtype='object')

In [35]:
zttest(df['Horror'])

Results of independent samples test:
Ttest_indResult(statistic=-0.5569568581794282, pvalue=0.578075911714347)
Results of Welch's test:
Ttest_indResult(statistic=-0.4930555630270584, pvalue=0.6363176662287826)


In [36]:
zttest(df['Sci-Fi'])

Results of independent samples test:
Ttest_indResult(statistic=-0.5317833166753737, pvalue=0.595367963357024)
Results of Welch's test:
Ttest_indResult(statistic=-0.5410661018219812, pvalue=0.5890222441784569)


In [37]:
zttest(df['Drama'])

Results of independent samples test:
Ttest_indResult(statistic=1.3110353301199626, pvalue=0.1910988355841852)
Results of Welch's test:
Ttest_indResult(statistic=1.4710112479189885, pvalue=0.14857387292738394)


In [38]:
zttest(df['Thriller'])

Results of independent samples test:
Ttest_indResult(statistic=1.9110960884408403, pvalue=0.057183570738536194)
Results of Welch's test:
Ttest_indResult(statistic=1.9337674550057689, pvalue=0.05529269669344954)


In [39]:
zttest(df['Comedy'])

Results of independent samples test:
Ttest_indResult(statistic=1.2934964479755398, pvalue=0.19708295498136574)
Results of Welch's test:
Ttest_indResult(statistic=1.3835379096733165, pvalue=0.16862055501766904)


In [40]:
zttest(df['Adventure'])

Results of independent samples test:
Ttest_indResult(statistic=1.0734618658493917, pvalue=0.28414205930430675)
Results of Welch's test:
Ttest_indResult(statistic=1.064732018117567, pvalue=0.29764452168538125)


In [41]:
zttest(df['Mystery'])

Results of independent samples test:
Ttest_indResult(statistic=2.7355147002434905, pvalue=0.0066933971948343165)
Results of Welch's test:
Ttest_indResult(statistic=3.007051495507354, pvalue=0.006818503945862065)


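Significant results for Mystery at p < 0.01. I will check the value counts before moving on. Because this is one of many genre tests and the Mystery group is small, the result should be read with some caution.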

In [42]:
df['Mystery'].value_counts()

False    224
True      18
Name: Mystery, dtype: int64

In [43]:
zttest(df['Crime'])

Results of independent samples test:
Ttest_indResult(statistic=1.2206459053342795, pvalue=0.2234180228132845)
Results of Welch's test:
Ttest_indResult(statistic=1.3193861644769156, pvalue=0.2255157989588943)


In [44]:
df.columns

Index(['Unnamed: 0', 'Title', 'Title Type', 'IMDb Rating', 'Runtime (mins)',
       'Year', 'Genres', 'Num Votes', 'Release Date', 'Directors', 'Horror',
       'Sci-Fi', 'Drama', 'Thriller', 'Comedy', 'Adventure', 'Action', 'Adult',
       'Mystery', 'Crime', 'Romance', 'War', 'Fantasy', 'Animation', 'Musical',
       'Family', 'Western', 'Title_Dead', 'Title_Zombie'],
      dtype='object')

In [45]:
zttest(df['Romance'])

Results of independent samples test:
Ttest_indResult(statistic=1.4176055225970774, pvalue=0.15760292929053396)
Results of Welch's test:
Ttest_indResult(statistic=1.5352704357283475, pvalue=0.18198826766642692)


In [46]:
zttest(df['War'])

Results of independent samples test:
Ttest_indResult(statistic=-1.2704763875241218, pvalue=0.20514519434135278)
Results of Welch's test:
Ttest_indResult(statistic=-0.7417775442323461, pvalue=0.5931360232551979)


In [47]:
zttest(df['Fantasy'])

Results of independent samples test:
Ttest_indResult(statistic=1.6955861226694362, pvalue=0.09126110887894114)
Results of Welch's test:
Ttest_indResult(statistic=1.7511501129216782, pvalue=0.08953328622963393)


In [48]:
zttest(df['Animation'])

Results of independent samples test:
Ttest_indResult(statistic=0.1562655699356391, pvalue=0.8759550579762747)
Results of Welch's test:
Ttest_indResult(statistic=0.07587935525145027, pvalue=0.9517501188126114)


In [49]:
zttest(df['Musical'])

Results of independent samples test:
Ttest_indResult(statistic=1.3363524132145101, pvalue=0.1826996953800357)
Results of Welch's test:
Ttest_indResult(statistic=1.8641349562512284, pvalue=0.1972153502706195)


In [51]:
zttest(df['Western'])

Results of independent samples test:
Ttest_indResult(statistic=-0.3174884486271553, pvalue=0.7511490422921452)
Results of Welch's test:
Ttest_indResult(statistic=-0.23083333336771641, pvalue=0.8386612086857858)


In [52]:
zttest(df['Title_Dead'])

Results of independent samples test:
Ttest_indResult(statistic=-0.05767469409782866, pvalue=0.9540557408051231)
Results of Welch's test:
Ttest_indResult(statistic=-0.05416186370346874, pvalue=0.9569817072755918)


In [53]:
df['Family'].value_counts()

False    241
True       1
Name: Family, dtype: int64

Family and Adult films (above and below, respectively) each had only one film in the True category, which is not enough for the t-test to run.

In [54]:
df['Adult'].value_counts()

False    241
True       1
Name: Adult, dtype: int64