In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%config InlineBackend.figure_format = 'retina'

# Conversion to DataFrame

In [5]:
# xml
# NOTE(review): absolute path under one user's home directory; it will not
# resolve on other machines. Consider a path relative to the notebook.
students_xml_path = '/Users/nestor/Documents/Fall Module 1/Communications for Analytics/msds610-eda-pandas/students.xml'
students_df = pd.read_xml(students_xml_path)
students_df.head()

Unnamed: 0,name,email,grade,age
0,John,john@mail.com,A,16
1,Alice,alice@mail.com,B,17
2,Bob,bob@mail.com,C,16
3,Hannah,hannah@mail.com,A,17


In [6]:
# json
# NOTE(review): absolute path under one user's home directory; it will not
# resolve on other machines. Consider a path relative to the notebook.
subjects_json_path = '/Users/nestor/Documents/Fall Module 1/Communications for Analytics/msds610-eda-pandas/students.json'
subjects_df = pd.read_json(subjects_json_path)
subjects_df.head()

Unnamed: 0,id,name,math,physics,chemistry
0,A001,Tom,60,66,61
1,A002,James,89,76,51
2,A003,Jenny,79,90,78


In [None]:
# html
# read_html returns a list of tables; `match` narrows it to tables containing
# the given text, and the first match is kept. Requires network access.
matching_tables = pd.read_html(
    'https://en.wikipedia.org/wiki/Minnesota',
    match='Election results from statewide races',
)
table_MN = matching_tables[0]
table_MN.head()

In [None]:
# csv
# NOTE(review): absolute path under /Users/martiheit, a different home
# directory than the XML/JSON cells (/Users/nestor), so the load cells cannot
# all succeed on one machine as written. Consider a notebook-relative path.
diamonds_csv_path = '/Users/martiheit/data/diamonds.csv'
diamonds_df = pd.read_csv(diamonds_csv_path)
diamonds_df.head()

# Exploration

In [44]:
# Toy student data, written one record per student. The first previous_job is
# missing (NaN); birthday and acct_balance are entered as strings here.
student_columns = ['id', 'name', 'age', 'previous_job', 'birthday', 'acct_balance']
student_rows = [
    (1, 'Marti', 21, np.nan, '05-12-2000', '0'),
    (2, 'Joleena', 22, 'Engineer', '05-12-1999', '120000'),
    (3, 'Nestor', 23, 'Teacher', '03-22-1998', '40000'),
    (4, 'Faye', 24, 'Engineer', '01-01-1997', '140000'),
]
# Pivot the records into a column-name -> list-of-values dict.
student_info = {
    column: list(values)
    for column, values in zip(student_columns, zip(*student_rows))
}
df_student = pd.DataFrame(student_info)
df_student.head()

Unnamed: 0,id,name,age,previous_job,birthday,acct_balance
0,1,Marti,21,,05-12-2000,0
1,2,Joleena,22,Engineer,05-12-1999,120000
2,3,Nestor,23,Teacher,03-22-1998,40000
3,4,Faye,24,Engineer,01-01-1997,140000


In [31]:
# (number of rows, number of columns) of the student frame.
df_student.shape

(4, 6)

In [32]:
# Column labels, in order.
df_student.columns

Index(['id', 'name', 'age', 'previous_job', 'birthday', 'acct_balance'], dtype='object')

In [33]:
# Summary of the index, per-column non-null counts, dtypes and memory usage.
df_student.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            4 non-null      int64 
 1   name          4 non-null      object
 2   age           4 non-null      int64 
 3   previous_job  3 non-null      object
 4   birthday      4 non-null      object
 5   acct_balance  4 non-null      object
dtypes: int64(2), object(4)
memory usage: 320.0+ bytes


In [34]:
# Element-wise missing-value mask: True where a cell is missing.
df_student.isnull()

Unnamed: 0,id,name,age,previous_job,birthday,acct_balance
0,False,False,False,True,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False


In [35]:
# Summary statistics; with default arguments only the numeric columns
# (id, age) are included.
df_student.describe()

Unnamed: 0,id,age
count,4.0,4.0
mean,2.5,22.5
std,1.290994,1.290994
min,1.0,21.0
25%,1.75,21.75
50%,2.5,22.5
75%,3.25,23.25
max,4.0,24.0


In [36]:
# Frequency of each previous job. NaN is excluded by default, so the student
# with no previous job is not counted here.
df_student['previous_job'].value_counts()

Engineer    2
Teacher     1
Name: previous_job, dtype: int64

# Preprocessing

## Converting Data Types

In [45]:
# Check dtypes before converting: birthday and acct_balance are still object.
df_student.dtypes

id               int64
name            object
age              int64
previous_job    object
birthday        object
acct_balance    object
dtype: object

In [46]:
# acct_balance holds strings at this point, so describe() reports
# count/unique/top/freq rather than numeric statistics.
df_student['acct_balance'].describe()

count     4
unique    4
top       0
freq      1
Name: acct_balance, dtype: object

In [47]:
# Parse the string columns into proper dtypes.
# The birthdays are written month-day-year ('03-22-1998' is only valid that
# way), but values like '05-12-2000' are ambiguous on their own. Passing the
# format explicitly avoids relying on pandas' format inference.
df_student['birthday'] = pd.to_datetime(df_student['birthday'], format='%m-%d-%Y')
# Balances are digit strings; to_numeric picks the narrowest fitting numeric dtype.
df_student['acct_balance'] = pd.to_numeric(df_student['acct_balance'])
df_student.dtypes

id                       int64
name                    object
age                      int64
previous_job            object
birthday        datetime64[ns]
acct_balance             int64
dtype: object

In [48]:
# Treat a missing previous job as 'Student'. The dict limits the fill to the
# previous_job column; all other columns are left untouched.
df_student = df_student.fillna({'previous_job': 'Student'})

In [29]:
# Display the whole frame (only four rows) to check the converted columns
# and the filled-in previous_job.
df_student

Unnamed: 0,id,name,age,previous_job,birthday,acct_balance
0,1,Marti,21,Student,2000-05-12,0
1,2,Joleena,22,Engineer,1999-05-12,120000
2,3,Nestor,23,Teacher,1998-03-22,40000
3,4,Faye,24,Engineer,1997-01-01,140000


## Appending Data

In [None]:
# Add a 'gender' column. The list is positional: one entry per existing row,
# in row order.
gender_by_row = ['F', 'F', 'M', 'F']
df_student['gender'] = gender_by_row
df_student

## Removing/Reindexing Data

In [None]:
# Drop the 'id' column. df_student is rebound here, so on a second run of this
# cell 'id' is already gone; errors='ignore' keeps the cell re-runnable
# instead of raising KeyError.
df_student = df_student.drop(columns='id', errors='ignore')
# Separate frame indexed by age. set_index returns a new frame, so df_student
# keeps its default index and its 'age' column.
df_age_index = df_student.set_index('age')

In [None]:
# Preview the age-indexed frame.
df_age_index.head()

# Visualization

In [None]:
# Scatter plot of account balance against age, one point per student.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(df_student['age'], df_student['acct_balance'])
# Hide the right and top borders of the plot area.
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
ax.set(xlabel='Age', ylabel='Account Balance', title='Age vs Account Balance')
plt.show()

In [None]:
# Count students per previous job once and reuse the result for both the
# bar labels and the bar heights.
job_counts = df_student['previous_job'].value_counts()
x = job_counts.index
y = job_counts.values

fig, ax = plt.subplots(figsize=(6, 6))
ax.bar(x, y)
ax.set_title('Previous Job Frequency')
# Hide the right and top borders of the plot area.
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.set_xlabel('Previous Job')
ax.set_ylabel('Count')
plt.show()
