In [None]:
import pandas as pd
import numpy as np

In [None]:
# All four source files live in the same folder; keeping that folder in one
# variable means a relocated dataset needs a single edit instead of four.
data_dir = '/Users/marcsoler/Documents/GitHub/Ironhack_lab_work/Case Studies/Week 1/H4A/data'

file1 = pd.read_csv(f'{data_dir}/file1.csv')
file2 = pd.read_csv(f'{data_dir}/file2.txt', sep='\t')  # tab-separated text file
file3 = pd.read_excel(f'{data_dir}/file3.xlsx')
file4 = pd.read_excel(f'{data_dir}/file4.xlsx')

# File description

In [None]:
# Summary statistics for file1.
file1.describe()

In [None]:
file1.tail(10)
# Look at the last 10 rows to check that the index numbering is sound at the end of the file.

In [None]:
# Last 10 rows of file2, for comparison with file1.
file2.tail(10)

In [None]:
# Distinct spellings present in file2's GENDER column.
file2['GENDER'].unique()

In [None]:
# Column dtypes and non-null counts, one file at a time.
file1.info()

In [None]:
file2.info()

In [None]:
file3.info()

In [None]:
file4.info()

## notes to return to later
- missing a lot of gender values
- lots of zeroes showing in columns
- dtypes discrepancies, especially in CONTROLN
- occasional missing values

# Merging the dataframes with concat()

In [None]:
# Stack the four files row-wise. ignore_index=True renumbers the rows 0..n-1,
# which has the same effect as calling .reset_index(drop=True) after the concat.
# (An index reset is needed after any concat or row removal.)
frames = [file1, file2, file3, file4]
data = pd.concat(frames, ignore_index=True)

In [None]:
# Display the merged dataframe.
data

In [None]:
# Column dtypes and non-null counts of the merged dataframe.
data.info()

# Basic cleaning of the dataframe

## Changing the headers into lower case

In [None]:
# Lower-case every header so later cells can rely on one consistent spelling.
cols = [header.lower() for header in data.columns]
data.columns = cols
data.columns

## Drop useless columns

In [None]:
datanew = data.drop(['controln', 'pobc2'], axis = 1)

In [None]:
datanew.info()

## Renaming columns to be more meaningful

In [None]:
# Old header -> descriptive header.
new_names = dict(hv1='medianhv', ic5='percapita')
data = data.rename(columns=new_names)
data.head()

## Looking into datatypes
- controln should be object
- hv1 should be float
- ic5 should be float
- dob should be date

In [None]:
data.dtypes

In [None]:
data['medianhv'] = pd.to_numeric(data['medianhv'], errors='coerce')
data['percapita'] = pd.to_numeric(data['percapita'], errors='coerce')

In [None]:
data.info()

## Searching for and eliminating duplicates

In [None]:
# Called without arguments, drop_duplicates() compares whole rows and removes
# the rows that are full duplicates of an earlier one.
data = data.drop_duplicates()

# Filter and query data in a dataframe

In [None]:
# Method 1: query
# The condition is written as a string; both comparisons must hold for a row to be returned.
data.query('gender == "F" & state=="CA"')

In [None]:
# Method 2: using tilde symbol ~ to do inverse filtering
# isin() marks the rows whose gender is one of the listed spellings and ~ negates that mask,
# so rows with a missing gender are returned as well (isin() is False for NaN).
female_genders = ['F', 'f', 'FEMALE','female', 'feamale']
data[~data.gender.isin(female_genders)]

In [None]:
# Method 3: iloc (by index)
# Position-based selection: 4:6 returns the rows at positions 4 and 5.
data.iloc[4:6] #plus rows, columns, lists as parameters

In [None]:
# Method 4: loc (by name)
# Here loc receives a boolean mask and returns the rows where it is True.
data.loc[data['gender'] == 'F']

In [None]:
# Method 5: using brackets and criteria (allows to mix string and number filters)
# Each criterion is wrapped in parentheses and the masks are combined with &.
data[(data['state']=='FL')&(data['gender']=='M')&(data['ic1']>300)]

In [None]:
# Method 6: .filter()
# Selects columns by label (axis=1); it does not filter rows by value.
data.filter(items=['state', 'gender'], axis=1)

# Dealing with NULLS

In [None]:
# Snapshot table for handling nulls: one row per column of `data`, giving the
# percentage of null values in that column.
# - isna().mean() is the fraction of nulls (same as isna().sum() / len(data)).
# - Scaling to percent *before* rounding avoids float noise such as 12.340000000000002.
# - The index and column are named directly; the earlier rename of an 'index'
#   column was a no-op because no such column exists without reset_index().
nulls_df = (
    data.isna().mean()
    .mul(100)
    .round(2)
    .rename('prop_nulls')
    .rename_axis('header')
    .to_frame()
)
nulls_df

## Filling the gender column with 'Unknown'

In [None]:
data['gender'] = data['gender'].fillna('Unknown')
# check it has worked: data[data['gender'].isna()==True]

In [None]:
data['gender'].value_counts()

## Filling the medianhv with the column's median

In [None]:
# First we compute the median of the column and set is as a variable
med_hv1 = data['medianhv'].median()
# Then, we fill the nulls with the computed median
data['medianhv'] = data['medianhv'].fillna(med_hv1)

In [None]:
data.info()

## Dropping the rest of NULL values

In [None]:
# data = data.dropna(subset=['percapita'])
data = data.dropna()
# After dropping nulls, always reset index
data.reset_index(drop=True, inplace=True)

In [None]:
data.info()

# Data quality issues - standardise the values in...

In [None]:
# gender
# Title-casing makes spellings that differ only in capitalisation identical.
data['gender'] = data['gender'].str.title() #we eliminate some of the options by standardising the capitalisation.
# Count what is left to see which spellings still need mapping.
data['gender'].value_counts()

In [None]:
# It can be done with a dictionary or a logical approach (if statements)
def clean_gender(x):
    """Map a raw gender value to 'Male', 'Female' or 'Unknown'.

    Matching ignores capitalisation and surrounding whitespace, so the
    function gives the same answer whether or not the column was
    title-cased beforehand.

    Args:
        x: Raw cell value. Anything that is not a string (e.g. NaN or None)
            is treated as unknown.

    Returns:
        'Male' for 'M'/'Male', 'Female' for 'F'/'Female' and the misspelling
        'Feamale', otherwise 'Unknown'.
    """
    if not isinstance(x, str):
        return 'Unknown'
    label = x.strip().lower()
    if label in ('m', 'male'):
        return 'Male'
    if label in ('f', 'female', 'feamale'):
        return 'Female'
    return 'Unknown'

In [None]:
# list + map
data['gender'] = list(map(clean_gender, data['gender']))

In [None]:
data['gender'].value_counts()

# Lambda + list(map)

In [None]:
# A lambda is an anonymous function (it has no name of its own). It is quick to
# write, but should not be used for complex logic.
# Here the lambda adds 2 to its argument, so y(100) evaluates to 102.
y = lambda x: x+2
y(100)

In [None]:
# Using lambda to upper case a column
# map() is lazy: without list() the cell only shows a map object and the lambda
# is never called. The result is displayed; the column itself is not modified.
list(map(lambda x: x.upper(), data['gender']))
# The same could be done with .str.upper(), a named function, a for loop or a list comprehension

In [None]:
# Square these numbers with a lambda
num = [1, 2, 3, 4, 5, 6]
sq = lambda x: x**2
# map() is lazy, so a bare map(sq, num) computes nothing; list() forces the
# evaluation and the result is kept so it can be inspected.
sq_mapped = list(map(sq, num))

# list comprehension giving the same result
sq_num = [x**2 for x in num]

# Date and time

In [None]:
import time
from datetime import date
# Load the NO2 air-quality CSV used by the date/time cells that follow.
datae = pd.read_csv('/Users/marcsoler/Documents/GitHub/BCNDATA0122/ClassMaterials/Week1/Day3pandas/air_quality_no2_long.csv')


In [None]:
datae.head()

In [None]:
# Parse the date.utc field into a datetime type.
# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
datae['date.utc'] = pd.to_datetime(datae['date.utc'], errors='coerce') # NOTE(review): pandas has no to_date(); use .dt.date if only the date part is needed.
datae.info()

In [None]:
# Isolate parts of the date - pull out the day, month, hour...
# The .dt accessor exposes the datetime components; weekday is numbered Monday=0 ... Sunday=6.
datae['weekday'] = datae['date.utc'].dt.weekday
datae.head(10)

In [None]:
# Overwrite the weekday column with the abbreviated day name (strftime '%a').
datae['weekday'] = datae['date.utc'].dt.strftime('%a')
datae.head()

In [None]:
# Creating a new column for date only and another for the hour
# Both are produced by strftime, so they hold strings, not date or integer values.
datae['date'] = datae['date.utc'].dt.strftime('%d/%m/%Y')
datae['hour'] = datae['date.utc'].dt.strftime('%H')

In [None]:
datae.head()

In [None]:
# Today's date, from the standard-library datetime module.
date.today()

In [None]:
# Current local time, from the standard-library time module.
time.localtime()

# Exploring and describing data
- Understanding the data in terms of the spread, variation, noisiness  
- Mean, median, mode: what they mean in each column  
- Understanding the frequency of values  
- Bucketing

**Useful functions:**
describe()  
info()  
shape  
unique()  
value_counts()  
tail()  
head()  
nunique()  
nsmallest() nlargest()  
groupby().agg()  

In [None]:
datae.describe(include=[object]) # Includes the object data type.

In [None]:
# groupby pivot fields () and a value column [literally value here] and then we can apply some calculations.
city_summary = datae.groupby('city')['value'].mean().sort_values()
city_summary

In [None]:
hour_summary = datae.groupby('hour')['value'].mean().sort_values()
hour_summary

In [None]:
type(hour_summary)

In [None]:
city_hour_summary = datae.groupby(['city','hour']).agg('mean').sort_values(by=['city', 'value'])
city_hour_summary.tail(30)

In [None]:
type(city_hour_summary)

# Visual descriptive analysis (Exploratory Data Analysis)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 
# or call plt.show() to make the visualisations appear embedded in the notebook

In [None]:
# Default plotting through the pandas .plot() method.
# Bar chart of the number of rows per city.
datae['city'].value_counts().plot(kind='bar', color='purple')

In [None]:
# Bar chart of the number of rows per hour label.
datae['hour'].value_counts().plot(kind='bar', color='red')

In [None]:
# 'hour' holds strings (built with strftime('%H')), so the x axis is categorical.
plt.scatter(x=datae['hour'], y=datae['value'])

In [None]:
# Same hour/value relationship with seaborn, coloured by city.
sns.catplot(data=datae, x='hour', y='value', hue='city', palette='dark')

In [None]:
paris = datae.query('city=="Paris"')

In [None]:
plt.figure(figsize=(12,8))
plt.plot(paris['date'], paris['value'])
plt.title('Air pollution in Paris over time')
plt.xlabel('date')
plt.ylabel('no2 level')