# 04 - Beginning Data Analysis 

In [1]:
import pandas as pd
import numpy as np
import os

dataPath = os.path.dirname(os.getcwd()) + '\Data'

In [2]:
college = pd.read_csv(dataPath + '/college.csv')
college.head()

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
0,Alabama A & M University,Normal,AL,1.0,0.0,0.0,0,424.0,420.0,0.0,...,0.0,0.0059,0.0138,0.0656,1,0.7356,0.8284,0.1049,30300,33888.0
1,University of Alabama at Birmingham,Birmingham,AL,0.0,0.0,0.0,0,570.0,565.0,0.0,...,0.0368,0.0179,0.01,0.2607,1,0.346,0.5214,0.2422,39700,21941.5
2,Amridge University,Montgomery,AL,0.0,0.0,0.0,1,,,1.0,...,0.0,0.0,0.2715,0.4536,1,0.6801,0.7795,0.854,40100,23370.0
3,University of Alabama in Huntsville,Huntsville,AL,0.0,0.0,0.0,0,595.0,590.0,0.0,...,0.0172,0.0332,0.035,0.2146,1,0.3072,0.4596,0.264,45500,24097.0
4,Alabama State University,Montgomery,AL,1.0,0.0,0.0,0,425.0,430.0,0.0,...,0.0098,0.0243,0.0137,0.0892,1,0.7347,0.7554,0.127,26600,33118.5


## Developing a data analysis routine

In [3]:
college.sample(random_state=42)

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3649,Career Point College,San Antonio,TX,0.0,0.0,0.0,0,,,0.0,...,0.0,0.0,0.0,0.0,1,0.9172,0.9172,0.697,20700,14977


In [4]:
college.shape

(7535, 27)

List the data type of each column, the number of non-missing values, and memory usage with the .info method:

In [5]:
college.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7535 entries, 0 to 7534
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   INSTNM              7535 non-null   object 
 1   CITY                7535 non-null   object 
 2   STABBR              7535 non-null   object 
 3   HBCU                7164 non-null   float64
 4   MENONLY             7164 non-null   float64
 5   WOMENONLY           7164 non-null   float64
 6   RELAFFIL            7535 non-null   int64  
 7   SATVRMID            1185 non-null   float64
 8   SATMTMID            1196 non-null   float64
 9   DISTANCEONLY        7164 non-null   float64
 10  UGDS                6874 non-null   float64
 11  UGDS_WHITE          6874 non-null   float64
 12  UGDS_BLACK          6874 non-null   float64
 13  UGDS_HISP           6874 non-null   float64
 14  UGDS_ASIAN          6874 non-null   float64
 15  UGDS_AIAN           6874 non-null   float64
 16  UGDS_N

Get summary statistics for the numerical columns and transpose the DataFrame for more readable output:

In [6]:
(college
 .describe(include = np.number)
 .T)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
HBCU,7164.0,0.014238,0.118478,0.0,0.0,0.0,0.0,1.0
MENONLY,7164.0,0.009213,0.095546,0.0,0.0,0.0,0.0,1.0
WOMENONLY,7164.0,0.005304,0.072642,0.0,0.0,0.0,0.0,1.0
RELAFFIL,7535.0,0.190975,0.393096,0.0,0.0,0.0,0.0,1.0
SATVRMID,1185.0,522.819409,68.578862,290.0,475.0,510.0,555.0,765.0
SATMTMID,1196.0,530.76505,73.469767,310.0,482.0,520.0,565.0,785.0
DISTANCEONLY,7164.0,0.005583,0.074519,0.0,0.0,0.0,0.0,1.0
UGDS,6874.0,2356.83794,5474.275871,0.0,117.0,412.5,1929.5,151558.0
UGDS_WHITE,6874.0,0.510207,0.286958,0.0,0.2675,0.5557,0.747875,1.0
UGDS_BLACK,6874.0,0.189997,0.224587,0.0,0.036125,0.10005,0.2577,1.0


Get summary statistics for the object (string) columns

In [7]:
(college
 .describe(include = object)
 .T)

Unnamed: 0,count,unique,top,freq
INSTNM,7535,7535,Mount Washington College,1
CITY,7535,2514,New York,87
STABBR,7535,59,CA,773
MD_EARN_WNE_P10,6413,598,PrivacySuppressed,822
GRAD_DEBT_MDN_SUPP,7503,2038,PrivacySuppressed,1510


In [8]:
dict_df = pd.read_csv(dataPath + "/college_data_dictionary.csv")
dict_df

Unnamed: 0,column_name,description
0,INSTNM,Institution Name
1,CITY,City Location
2,STABBR,State Abbreviation
3,HBCU,Historically Black College or University
4,MENONLY,0/1 Men Only
5,WOMENONLY,0/1 Women only
6,RELAFFIL,0/1 Religious Affiliation
7,SATVRMID,SAT Verbal Median
8,SATMTMID,SAT Math Median
9,DISTANCEONLY,Distance Education Only


## Reducing memory by changing data types

In [9]:
different_cols = [
    "RELAFFIL",
    "SATMTMID",
    "CURROPER",
    "INSTNM",
    "STABBR",
]

col2 = college.loc[:,different_cols]
col2.head()

Unnamed: 0,RELAFFIL,SATMTMID,CURROPER,INSTNM,STABBR
0,0,420.0,1,Alabama A & M University,AL
1,0,565.0,1,University of Alabama at Birmingham,AL
2,1,,1,Amridge University,AL
3,0,590.0,1,University of Alabama in Huntsville,AL
4,0,430.0,1,Alabama State University,AL


In [10]:
college.dtypes

INSTNM                 object
CITY                   object
STABBR                 object
HBCU                  float64
MENONLY               float64
WOMENONLY             float64
RELAFFIL                int64
SATVRMID              float64
SATMTMID              float64
DISTANCEONLY          float64
UGDS                  float64
UGDS_WHITE            float64
UGDS_BLACK            float64
UGDS_HISP             float64
UGDS_ASIAN            float64
UGDS_AIAN             float64
UGDS_NHPI             float64
UGDS_2MOR             float64
UGDS_NRA              float64
UGDS_UNKN             float64
PPTUG_EF              float64
CURROPER                int64
PCTPELL               float64
PCTFLOAN              float64
UG25ABV               float64
MD_EARN_WNE_P10        object
GRAD_DEBT_MDN_SUPP     object
dtype: object

In [11]:
original_mem = col2.memory_usage(deep=True)
original_mem

Index          128
RELAFFIL     60280
SATMTMID     60280
CURROPER     60280
INSTNM      660240
STABBR      444565
dtype: int64

There is no need to use 64 bits for the RELAFFIL column as it contains only 0 or 1. Let's convert this column to an 8-bit (1 byte) integer with the .astype method:

In [12]:
col2["RELAFFIL"] = col2["RELAFFIL"].astype(np.int8)
col2.dtypes

RELAFFIL       int8
SATMTMID    float64
CURROPER      int64
INSTNM       object
STABBR       object
dtype: object

In [13]:
col2.memory_usage(deep=True)

Index          128
RELAFFIL      7535
SATMTMID     60280
CURROPER     60280
INSTNM      660240
STABBR      444565
dtype: int64

To save even more memory, you will want to consider changing object data types to categorical if they have a reasonably low cardinality (number of unique values). Let's first check the number of unique values for both the object columns:

In [14]:
col2.select_dtypes(include=["object"]).nunique()

INSTNM    7535
STABBR      59
dtype: int64

The STABBR column is a good candidate to convert to categorical as less than one percent of its values are unique:

In [15]:
col2["STABBR"] = col2["STABBR"].astype("category")
col2.dtypes

RELAFFIL        int8
SATMTMID     float64
CURROPER       int64
INSTNM        object
STABBR      category
dtype: object

In [16]:
new_mem = col2.memory_usage(deep=True)
new_mem

Index          128
RELAFFIL      7535
SATMTMID     60280
CURROPER     60280
INSTNM      660699
STABBR       13120
dtype: int64

Finally, let's compare the original memory usage with our updated memory usage. The RELAFFIL column is, as expected, an eighth of its original size, while the STABBR column has shrunk to just three percent of its original size:

In [17]:
new_mem / original_mem

Index       1.000000
RELAFFIL    0.125000
SATMTMID    1.000000
CURROPER    1.000000
INSTNM      1.000695
STABBR      0.029512
dtype: float64

In [18]:
display(f'Original: {original_mem.sum()}',f'New Memory: {new_mem.sum()}')

'Original: 1285773'

'New Memory: 802042'

### How it works…

pandas defaults integer and float data types to 64 bits regardless of the maximum necessary size for the particular DataFrame. Integers, floats, and even Booleans may be coerced to a different data type with the .astype method and passing it the exact type, either as a string or specific object, as done in step 4.

The RELAFFIL column is a good choice to cast to a smaller integer type as the data dictionary explains that its values must be 0 or 1. The memory for RELAFFIL is now an eighth of CURROPER, which remains as its former type.

Columns that have an object data type, such as INSTNM, are not like the other pandas data types. For all the other pandas data types, each value in that column is the same data type. For instance, when a column has the int64 type, every column value is also int64. This is not true for columns that have the object data type. Each column value can be of any type. They can have a mix of strings, numerics, datetimes, or even other Python objects such as lists or tuples. For this reason, the object data type is sometimes referred to as a catch-all for a column of data that doesn't match any of the other data types. The vast majority of the time, though, object data type columns will all be strings.

Therefore, the memory of each value in an object data type column is inconsistent. There is no predefined amount of memory for each value like the other data types. For pandas to extract the exact amount of memory of an object data type column, the deep parameter must be set to True in the .memory_usage method.

Object columns are targets for the largest memory savings. pandas has an additional categorical data type that is not available in NumPy. When converting to category, pandas internally creates a mapping from integers to each unique string value. Thus, each string only needs to be kept a single time in memory. As you can see, this change of data type reduced memory usage by 97%.

You might also have noticed that the index uses an extremely low amount of memory. If no index is specified during DataFrame creation, as is the case in this recipe, pandas defaults the index to a RangeIndex. The RangeIndex is very similar to the built-in range function. It produces values on demand and only stores the minimum amount of information needed to create an index

## Selecting the smallest of the largest

In [19]:
movies = pd.read_csv(dataPath + '/movie.csv')
movie2 = movies[['movie_title', 'imdb_score', 'budget']]
movie2.head()

Unnamed: 0,movie_title,imdb_score,budget
0,Avatar,7.9,237000000.0
1,Pirates of the Caribbean: At World's End,7.1,300000000.0
2,Spectre,6.8,245000000.0
3,The Dark Knight Rises,8.5,250000000.0
4,Star Wars: Episode VII - The Force Awakens,7.1,


In [20]:
(movie2
 .nlargest(100,'imdb_score')
 .head(10))

Unnamed: 0,movie_title,imdb_score,budget
2725,Towering Inferno,9.5,
1920,The Shawshank Redemption,9.3,25000000.0
3402,The Godfather,9.2,6000000.0
2779,Dekalog,9.1,
4312,Kickboxer: Vengeance,9.1,17000000.0
66,The Dark Knight,9.0,185000000.0
2791,The Godfather: Part II,9.0,13000000.0
3415,Fargo,9.0,
335,The Lord of the Rings: The Return of the King,8.9,94000000.0
1857,Schindler's List,8.9,22000000.0


Chain the .nsmallest method to return the five lowest budget films among those with a top 100 score:

In [21]:
(movie2
 .nsmallest(100,'imdb_score')
 .nsmallest(5, 'budget')
 .head(20))

Unnamed: 0,movie_title,imdb_score,budget
4893,The Ridges,3.0,17350.0
4838,"Dude, Where's My Dog?!",3.2,20000.0
4890,Dry Spell,3.3,22000.0
4874,Raymond Did It,3.2,40000.0
4862,Hayride,3.4,60000.0


### How it works…
The first parameter of the .nlargest method, n, must be an integer and selects the number of rows to be returned. The second parameter, columns, takes a column name as a string. Step 2 returns the 100 highest-scoring movies. We could have saved this intermediate result as its own variable but instead, we chain the .nsmallest method to it in step 3, which returns exactly five rows, sorted by budget.

### There's more…
It is possible to pass a list of column names to the columns parameter of the .nlargest and .nsmallest methods. This would only be useful to break ties in the event that there were duplicate values sharing the nth ranked spot in the first column in the list.

## Selecting the largest of each group by sorting

In [22]:
movie = pd.read_csv(dataPath + "/movie.csv")
cols = ["movie_title", "title_year", "imdb_score"]
movie[cols]

Unnamed: 0,movie_title,title_year,imdb_score
0,Avatar,2009.0,7.9
1,Pirates of the Caribbean: At World's End,2007.0,7.1
2,Spectre,2015.0,6.8
3,The Dark Knight Rises,2012.0,8.5
4,Star Wars: Episode VII - The Force Awakens,,7.1
...,...,...,...
4911,Signed Sealed Delivered,2013.0,7.7
4912,The Following,,7.5
4913,A Plague So Pleasant,2013.0,6.3
4914,Shanghai Calling,2012.0,6.3


In [23]:
(movie[cols]
 .sort_values('title_year', ascending=True)
)

Unnamed: 0,movie_title,title_year,imdb_score
4695,Intolerance: Love's Struggle Throughout the Ages,1916.0,8.0
4833,Over the Hill to the Poorhouse,1920.0,4.8
4767,The Big Parade,1925.0,8.3
2694,Metropolis,1927.0,8.3
4697,The Broadway Melody,1929.0,6.3
...,...,...,...
4683,Heroes,,7.7
4688,Home Movies,,8.2
4704,Revolution,,6.7
4752,Happy Valley,,8.5


Sort multiple columns.

In [24]:
(movie[cols]
 .sort_values(['title_year', 'imdb_score'], ascending=False)
)

Unnamed: 0,movie_title,title_year,imdb_score
4312,Kickboxer: Vengeance,2016.0,9.1
4277,A Beginner's Guide to Snuff,2016.0,8.7
3798,Airlift,2016.0,8.5
27,Captain America: Civil War,2016.0,8.2
98,Godzilla Resurgence,2016.0,8.2
...,...,...,...
1391,Rush Hour,,5.8
4031,Creature,,5.0
2165,Meet the Browns,,3.5
3246,The Bold and the Beautiful,,3.5


Now, we use the .drop_duplicates method to keep only the first row of every year.

By default, .drop_duplicates keeps the very first appearance of a value, but this behavior may be modified by passing keep='last' to select the last row of each group or keep=False to drop all duplicates entirely.

In [36]:
df = (
  movie[cols]
 .sort_values(['title_year', 'imdb_score'], ascending=False)
 .drop_duplicates(subset='title_year')
)
df

Unnamed: 0,movie_title,title_year,imdb_score
4312,Kickboxer: Vengeance,2016.0,9.1
3745,Running Forever,2015.0,8.6
4369,Queen of the Mountains,2014.0,8.7
3935,"Batman: The Dark Knight Returns, Part 2",2013.0,8.4
3,The Dark Knight Rises,2012.0,8.5
...,...,...,...
2694,Metropolis,1927.0,8.3
4767,The Big Parade,1925.0,8.3
4833,Over the Hill to the Poorhouse,1920.0,4.8
4695,Intolerance: Love's Struggle Throughout the Ages,1916.0,8.0


### How it works
The default behavior of the .drop_duplicates method is to keep the first occurrence of each unique row, which would not drop any rows as each row is unique. However, the subset parameter alters it to only consider the column (or list of columns) given to it. In this example, only one row for each year will be returned. As we sorted by year and score in the last step, the highest-scoring movie for each year is what we get.

### There's more…
As in most things pandas, there is more than one way to do this. If you find yourself comfortable with grouping operations, you can use the .groupby method to do this as well:

In [37]:
df1 = (movie[["movie_title", "title_year", "imdb_score"]]
 .groupby("title_year", as_index=False)
 .apply(
        lambda df:df.sort_values("imdb_score", ascending=False).head(1)
  )
 .droplevel(1)
 .sort_values("title_year", ascending=False)
)
df1

Unnamed: 0,movie_title,title_year,imdb_score
90,Kickboxer: Vengeance,2016.0,9.1
89,Running Forever,2015.0,8.6
88,Queen of the Mountains,2014.0,8.7
87,"Batman: The Dark Knight Returns, Part 2",2013.0,8.4
86,The Dark Knight Rises,2012.0,8.5
...,...,...,...
4,Pandora's Box,1929.0,8.0
3,Metropolis,1927.0,8.3
2,The Big Parade,1925.0,8.3
1,Over the Hill to the Poorhouse,1920.0,4.8
