# Pandas Vignette

March 30, 2022

Vignette: wrangling and filtering DataFrames with Pandas

@author: Oscar A. Trevizo

### References
1. "Pandas documentation" (accessed Dec. 20, 2022)
-    https://pandas.pydata.org/docs/
-    https://pandas.pydata.org/pandas-docs/stable/reference/general_functions.html
-    https://pandas.pydata.org/docs/reference/io.html
-    https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html


# Pandas


## Import libraries

In [1]:
import pandas as pd
import numpy as np

## Create a DataFrame

In [2]:
# Raw inputs: the name strings carry stray whitespace, and Jenny's record has
# a non-alphabetic last name and non-numeric scores.
first_name = [
    " Joan", "Mary ", " Vijay ", "Rob ", "Martha",
    "Josh", " Vicky", " Mario", "Jenny", "Joe",
]
last_name = [" T", " K ", " N ", "R ", "L", "F ", " R", " L", "%^", "P"]
score_1 = [91, 83, 95, 72, 91, 85, 89, 82, 'abc', 79]
score_2 = [91, 85, 90, 81, 95, 92, 88, 94, 'xyz', 75]

# Assemble the four lists into a DataFrame; keyword order sets the column order.
df = pd.DataFrame(
    dict(
        first_name=first_name,
        last_name=last_name,
        score_1=score_1,
        score_2=score_2,
    )
)
df.head(10)

Unnamed: 0,first_name,last_name,score_1,score_2
0,Joan,T,91,91
1,Mary,K,83,85
2,Vijay,N,95,90
3,Rob,R,72,81
4,Martha,L,91,95
5,Josh,F,85,92
6,Vicky,R,89,88
7,Mario,L,82,94
8,Jenny,%^,abc,xyz
9,Joe,P,79,75


In [3]:
# Inspect column dtypes and non-null counts before any cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   first_name  10 non-null     object
 1   last_name   10 non-null     object
 2   score_1     10 non-null     object
 3   score_2     10 non-null     object
dtypes: object(4)
memory usage: 448.0+ bytes


## To numeric

In [4]:
# Convert score_1 to numbers; errors='coerce' turns unparseable entries into NaN.
df = df.assign(score_1=pd.to_numeric(df['score_1'], errors='coerce'))

In [5]:
# Convert score_2 to numbers; errors='coerce' turns unparseable entries into NaN.
df = df.assign(score_2=pd.to_numeric(df['score_2'], errors='coerce'))

In [6]:
# Re-check dtypes and non-null counts after the numeric conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   first_name  10 non-null     object 
 1   last_name   10 non-null     object 
 2   score_1     9 non-null      float64
 3   score_2     9 non-null      float64
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [7]:
# Summary statistics (count, mean, std, quartiles) for the numeric score columns.
df.describe()

Unnamed: 0,score_1,score_2
count,9.0,9.0
mean,85.222222,87.888889
std,7.120003,6.527719
min,72.0,75.0
25%,82.0,85.0
50%,85.0,90.0
75%,91.0,92.0
max,95.0,95.0


## Clean up blank spaces

In [8]:
# Trim leading and trailing whitespace from both name columns.
for name_col in ("first_name", "last_name"):
    df[name_col] = df[name_col].str.strip()
df.head(10)

Unnamed: 0,first_name,last_name,score_1,score_2
0,Joan,T,91.0,91.0
1,Mary,K,83.0,85.0
2,Vijay,N,95.0,90.0
3,Rob,R,72.0,81.0
4,Martha,L,91.0,95.0
5,Josh,F,85.0,92.0
6,Vicky,R,89.0,88.0
7,Mario,L,82.0,94.0
8,Jenny,%^,,
9,Joe,P,79.0,75.0


## Regex clean up characters
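See the companion notebook `regex_vignette.ipynb` for regex background (author's local copy: http://localhost:8888/notebooks/Python/Jupyter_Vignettes/regex_vignette.ipynb — this localhost link only resolves on the author's machine).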

In [9]:
# Replace any last name that contains a non-letter character (e.g. '%^') with NaN.
# np.nan is used rather than the np.NaN alias: that alias was removed in
# NumPy 2.0 and raises AttributeError there.
df["last_name"] = df["last_name"].replace('[^A-Za-z]', np.nan, regex=True)

# NOTE(review): the score columns are already float64 at this point (converted
# with pd.to_numeric earlier), so these regex passes are expected to leave them
# unchanged -- confirm, and consider dropping them.
df["score_1"] = df["score_1"].replace('[^0-9]', np.nan, regex=True)
df["score_2"] = df["score_2"].replace('[^0-9]', np.nan, regex=True)

df.head(10)

Unnamed: 0,first_name,last_name,score_1,score_2
0,Joan,T,91.0,91.0
1,Mary,K,83.0,85.0
2,Vijay,N,95.0,90.0
3,Rob,R,72.0,81.0
4,Martha,L,91.0,95.0
5,Josh,F,85.0,92.0
6,Vicky,R,89.0,88.0
7,Mario,L,82.0,94.0
8,Jenny,,,
9,Joe,P,79.0,75.0


## Drop NaN

In [10]:
# Discard every row that still has a missing value in any column.
df = df.dropna()
df.head(10)

Unnamed: 0,first_name,last_name,score_1,score_2
0,Joan,T,91.0,91.0
1,Mary,K,83.0,85.0
2,Vijay,N,95.0,90.0
3,Rob,R,72.0,81.0
4,Martha,L,91.0,95.0
5,Josh,F,85.0,92.0
6,Vicky,R,89.0,88.0
7,Mario,L,82.0,94.0
9,Joe,P,79.0,75.0


## Assign data types

In [11]:
# Check the current dtypes before casting the columns explicitly.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9 entries, 0 to 9
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   first_name  9 non-null      object 
 1   last_name   9 non-null      object 
 2   score_1     9 non-null      float64
 3   score_2     9 non-null      float64
dtypes: float64(2), object(2)
memory usage: 360.0+ bytes


In [12]:
# Based on example from https://www.geeksforgeeks.org/change-data-type-for-one-or-more-columns-in-pandas-dataframe
# Similar reference in https://stackoverflow.com/questions/49684951/pandas-read-csv-dtype-read-all-columns-but-few-as-string
# Target types: the name columns become str, the score columns become int.
dtypes_dict = {
    **dict.fromkeys(('first_name', 'last_name'), str),
    **dict.fromkeys(('score_1', 'score_2'), int),
}
df = df.astype(dtypes_dict)

In [13]:
# Confirm the dtypes after the astype() cast.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9 entries, 0 to 9
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   first_name  9 non-null      object
 1   last_name   9 non-null      object
 2   score_1     9 non-null      int32 
 3   score_2     9 non-null      int32 
dtypes: int32(2), object(2)
memory usage: 288.0+ bytes


# Pandas statistics methods

In [14]:
# Arithmetic mean of the first score column.
df['score_1'].mean()

85.22222222222223

## Add column / List comprehension

In [15]:
# Lowest score that earns each letter, checked from the top grade down;
# a score below every floor falls through to 'F'.
grade_floors = ((90, 'A'), (80, 'B'), (70, 'C'), (60, 'D'))
df['s1_a'] = [
    next((letter for floor, letter in grade_floors if score >= floor), 'F')
    for score in df['score_1']
]
df

Unnamed: 0,first_name,last_name,score_1,score_2,s1_a
0,Joan,T,91,91,A
1,Mary,K,83,85,B
2,Vijay,N,95,90,A
3,Rob,R,72,81,C
4,Martha,L,91,95,A
5,Josh,F,85,92,B
6,Vicky,R,89,88,B
7,Mario,L,82,94,B
9,Joe,P,79,75,C


In [16]:
# Lowest score that earns each letter, checked from the top grade down;
# a score below every floor falls through to 'F'.
grade_floors = ((90, 'A'), (80, 'B'), (70, 'C'), (60, 'D'))
df['s2_a'] = [
    next((letter for floor, letter in grade_floors if score >= floor), 'F')
    for score in df['score_2']
]
df

Unnamed: 0,first_name,last_name,score_1,score_2,s1_a,s2_a
0,Joan,T,91,91,A,A
1,Mary,K,83,85,B,B
2,Vijay,N,95,90,A,A
3,Rob,R,72,81,C,B
4,Martha,L,91,95,A,A
5,Josh,F,85,92,B,A
6,Vicky,R,89,88,B,B
7,Mario,L,82,94,B,A
9,Joe,P,79,75,C,C


In [17]:
# Pair each row's two scores and flag whether the second is strictly higher.
score_pairs = zip(df['score_1'], df['score_2'])
df['improved'] = ['yes' if after > before else 'no' for before, after in score_pairs]
df

Unnamed: 0,first_name,last_name,score_1,score_2,s1_a,s2_a,improved
0,Joan,T,91,91,A,A,no
1,Mary,K,83,85,B,B,yes
2,Vijay,N,95,90,A,A,no
3,Rob,R,72,81,C,B,yes
4,Martha,L,91,95,A,A,yes
5,Josh,F,85,92,B,A,yes
6,Vicky,R,89,88,B,B,no
7,Mario,L,82,94,B,A,yes
9,Joe,P,79,75,C,C,no


##  Filters

In [18]:
# The boolean mask on its own: True only where score_1 equals the column maximum.
df["score_1"].eq(df["score_1"].max())

0    False
1    False
2     True
3    False
4    False
5    False
6    False
7    False
9    False
Name: score_1, dtype: bool

In [19]:
# Apply the mask and keep only selected columns, in a single .loc lookup.
df.loc[df["score_1"] == df["score_1"].max(), ['first_name', 'last_name', 'score_1']]

Unnamed: 0,first_name,last_name,score_1
2,Vijay,N,95


In [20]:
# Another filter for practice: rows where both scores are above 90.
both_above_90 = (df["score_1"] > 90) & (df['score_2'] > 90)
df.loc[both_above_90, ['first_name', 'last_name', 'score_1', 'score_2']]

Unnamed: 0,first_name,last_name,score_1,score_2
0,Joan,T,91,91
4,Martha,L,91,95
