### Reading Data with explicit schema

In [21]:
import csv
import pandas as pd

# Explicit input schema: each expected column mapped to the callable that
# converts its raw CSV text into the desired Python type.
schema = dict(
    name=str,
    age=int,
    gender=str,
    city=str,
    country=str,
)

# newline='' is the csv-module convention: the reader handles line endings itself.
with open('data.csv', newline='') as csvfile:
    # With no fieldnames given, DictReader uses the first row as column names
    # and yields one dict of strings per remaining row.
    reader = csv.DictReader(csvfile)

    for row in reader:
        # Apply each column's converter; only columns listed in the schema are kept.
        parsed_row = {
            column: convert(row[column]) for column, convert in schema.items()
        }
        # Process the parsed row (here: just show it).
        print(parsed_row)

{'name': 'Alice', 'age': 28, 'gender': 'F', 'city': 'New York', 'country': 'USA'}
{'name': 'Bob', 'age': 35, 'gender': 'M', 'city': 'San Francisco', 'country': 'USA'}
{'name': 'Charlie', 'age': 42, 'gender': 'M', 'city': 'London', 'country': 'UK'}
{'name': 'Diana', 'age': 19, 'gender': 'F', 'city': 'Paris', 'country': 'France'}
{'name': 'Emily', 'age': 25, 'gender': 'F', 'city': 'Berlin', 'country': 'Germany'}


### Reading data without explicit schema

In [23]:
import csv
import pandas as pd

# No schema is passed here: pandas takes the column names from the header row
# and infers a dtype for every column on its own.
df = pd.read_csv('data_new.csv')

# Print the plain-text rendering of the loaded frame.
print(df)

      name  age gender           city  country       state
0    Alice   28      F       New York      USA          TX
1      Bob   35      M  San Francisco      USA          KY
2  Charlie   42      M         London       UK    Coventry
3    Diana   19      F          Paris   France       Paris
4    Emily   25      F         Berlin  Germany  Struttgart


### Data Cleansing

In [24]:
import csv
import pandas as pd

# Load the sample file used for the cleansing step; cells pandas cannot read
# as a value come back as NaN (see the `age` column in the output).
df = pd.read_csv('data_blanks.csv')

# Print the frame so the missing values are visible before they are filled.
print(df)

      name   age gender           city  country       state
0    Alice  28.0      F       New York      USA          TX
1      Bob  35.0      M  San Francisco      USA          KY
2  Charlie  42.0      M         London       UK    Coventry
3    Diana  19.0      F          Paris   France       Paris
4    Emily  25.0      F         Berlin  Germany  Struttgart
5     Jack   NaN      F       New York      USA          WA
6     Jill   NaN      M  San Francisco      USA          NY


In [25]:
# Read in the data
df = pd.read_csv('data_blanks.csv')

# Replace missing values with the mean of their column.
# numeric_only=True restricts the mean to numeric columns: without it pandas
# also tries to average the text columns (name, gender, city, ...), which is
# deprecated with a FutureWarning on older versions and raises TypeError on
# pandas >= 2.0. Text columns are therefore left untouched by fillna.
# Reassigning instead of inplace=True keeps the cell safe to re-run.
df = df.fillna(df.mean(numeric_only=True))

print(df)

      name   age gender           city  country       state
0    Alice  28.0      F       New York      USA          TX
1      Bob  35.0      M  San Francisco      USA          KY
2  Charlie  42.0      M         London       UK    Coventry
3    Diana  19.0      F          Paris   France       Paris
4    Emily  25.0      F         Berlin  Germany  Struttgart
5     Jack  29.8      F       New York      USA          WA
6     Jill  29.8      M  San Francisco      USA          NY


  df.fillna(df.mean(), inplace=True)


### Data transformation

In [26]:
# Load the file that carries a date column; at this point pandas has not been
# asked to parse dates, so date_of_birth is reported as dtype `object`.
df = pd.read_csv('data_date.csv')

# Show the frame, a spacer, then the per-column dtypes.
# A single print with sep="\n" emits exactly what three separate prints would.
print(df, "\n\n", df.dtypes, sep="\n")

      name date_of_birth   age  gender      city    country
0    Alice    1990-01-01  32.5  female  New York        USA
1      Bob    1985-05-15  37.2    male    London         UK
2  Charlie    1995-12-31  26.8    male     Paris     France
3    David    1980-03-20  41.0    male    Sydney  Australia
4      Eve    1998-11-11  22.1  female     Tokyo      Japan



name              object
date_of_birth     object
age              float64
gender            object
city              object
country           object
dtype: object


In [27]:
# Parse the text dates into real timestamps and store them back in the same
# column of the existing frame.
dob_parsed = pd.to_datetime(df['date_of_birth'])
df['date_of_birth'] = dob_parsed

# Show the frame, a spacer, then the dtypes: date_of_birth is now datetime64[ns].
print(df, "\n\n", df.dtypes, sep="\n")

      name date_of_birth   age  gender      city    country
0    Alice    1990-01-01  32.5  female  New York        USA
1      Bob    1985-05-15  37.2    male    London         UK
2  Charlie    1995-12-31  26.8    male     Paris     France
3    David    1980-03-20  41.0    male    Sydney  Australia
4      Eve    1998-11-11  22.1  female     Tokyo      Japan



name                     object
date_of_birth    datetime64[ns]
age                     float64
gender                   object
city                     object
country                  object
dtype: object


### Data Enrichment

In [28]:
# Employee master data (id, name, age, country).
df1 = pd.read_csv('emp.csv')

# Show the first table followed by a spacer before the second one is loaded.
print(df1, "\n\n", sep="\n")

# Salary / department data, keyed by the same `id` column.
df2 = pd.read_csv('emp_sal.csv')

# Show the second table.
print(df2)

   id  name  age    country
0   1  John   25        USA
1   2  Jane   30     Canada
2   3  Jack   35         UK
3   4  Jill   40  Australia



   id  salary department
0   1   50000         IT
1   2   60000         HR
2   3   70000    Finance
3   5   80000      Sales


In [29]:
# Enrich the employee rows with salary and department via a left join on 'id':
# every row of df1 is kept, and ids with no match in df2 get NaN in the
# columns that come from df2.
merged_df = df1.merge(df2, on='id', how='left')

# Show the enriched frame.
print(merged_df)

   id  name  age    country   salary department
0   1  John   25        USA  50000.0         IT
1   2  Jane   30     Canada  60000.0         HR
2   3  Jack   35         UK  70000.0    Finance
3   4  Jill   40  Australia      NaN        NaN


### ETL

In [30]:
# Load the data into a database
import os

from sqlalchemy import create_engine

# Credentials must not be hardcoded in a notebook (it gets shared and
# committed). The full connection URL, password included, is read from the
# DATABASE_URL environment variable instead, e.g.
#   postgresql://<user>:<password>@localhost/postgres
database_url = os.environ.get('DATABASE_URL')
if not database_url:
    raise RuntimeError(
        "Set the DATABASE_URL environment variable before running this cell"
    )
engine = create_engine(database_url)

# Write merged_df to the 'merged' table, replacing the table if it already
# exists. Kept as the last expression so the cell still displays its result.
merged_df.to_sql('merged', engine, if_exists='replace')

4