# Import CSV into PostgreSQL database
The goal of this project is to import a CSV file into a PostgreSQL database. The steps to be followed are:
1. Understand the data from the CSV file
2. Create a SQL table that fits the data
3. Import a single value to the database
4. Import a row to the database
5. Import all the rows
Let's see which surprises await!

# Step 1: read csv

In [31]:
import psycopg2
import csv
import pandas as pd
with open('austin_weather.csv', 'r') as f:
    reader = csv.reader(f)
    columns = next(reader)
    first_row = next(reader)
    second_row = next(reader)
    print(columns)
    print(first_row)
    f.seek(0)
    for row in f:
        my_row = next(reader)
        print(len(my_row))

['Date', 'TempHighF', 'TempAvgF', 'TempLowF', 'DewPointHighF', 'DewPointAvgF', 'DewPointLowF', 'HumidityHighPercent', 'HumidityAvgPercent', 'HumidityLowPercent', 'SeaLevelPressureHighInches', 'SeaLevelPressureAvgInches', 'SeaLevelPressureLowInches', 'VisibilityHighMiles', 'VisibilityAvgMiles', 'VisibilityLowMiles', 'WindHighMPH', 'WindAvgMPH', 'WindGustMPH', 'PrecipitationSumInches', 'Events']
['2013-12-21', '74', '60', '45', '67', '49', '43', '93', '75', '57', '29.86', '29.68', '29.59', '10', '7', '2', '20', '4', '31', '0.46', 'Rain , Thunderstorm']
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21

So we can see that the .csv file contains a bunch of columns (Dat, TempHighF, etc.) with numerical data and text in the last column. The good news is that all rows contain 21 values, so no need to worry about the shape of the file. Next, let's create an SQL table with a column for each value.

# Step 2: create an SQL table which fits the data
Note that an ID column was added (which is not contained in the orginial data file). This is because the SQL database requires a unique field. We've set the ID to "serial" so that it will automatically increase when a new value is added.

In [32]:
sql_create_weather_table = """CREATE TABLE IF NOT EXISTS weather(
                                ID SERIAL PRIMARY KEY,
                                date date,
                                TempHighF integer,
                                TempAvgF integer,
                                TempLowF integer,
                                DewPointHighF integer,
                                DewPointAvgF integer,
                                DewPointLowF integer,
                                HumidityHighPercent integer,
                                HumidityAvgPercent integer,
                                HumidityLowPercent integer,
                                SeaLevelPressureHighInches real,
                                SeaLevelPressureAvgInches real,
                                SeaLevelPressureLowInches real,
                                VisibilityHighMiles integer,
                                VisibilityAvgMiles integer,
                                VisibilityLowMiles integer,
                                WindHighMPH integer,
                                WindAvgMPH integer,
                                WindGustMPH integer,
                                PrecipitationSumInches real,
                                Events text
                                )"""   

# create a connection with the database (this should already exist)
try:
    connection = psycopg2.connect("dbname='austin_weather_energy' user='muriel' host='localhost' password='1'")
    print("connected to austin_weather_energy")
except:
    print("Unable to connect to the database")
# the cursor can help us execute SQL
cursor = connection.cursor()

# delete the table if it already exists
sql = """DROP table weather"""
cursor.execute(sql)
connection.commit()

# now let's create the table
cursor.execute(sql_create_weather_table)
# and commit to the DB
connection.commit()

# next, let's print the column names to see if it worked:


def print_values():
    cursor.execute("SELECT * from weather")
    colnames = [desc[0] for desc in cursor.description]
    print("Columns in database:")
    print(colnames)
    rows = cursor.fetchall()
    print("Values in database:")
    for row in rows:
        print(" ", row)
    connection.commit()
print_values()

connected to austin_weather_energy
Columns in database:
['id', 'date', 'temphighf', 'tempavgf', 'templowf', 'dewpointhighf', 'dewpointavgf', 'dewpointlowf', 'humidityhighpercent', 'humidityavgpercent', 'humiditylowpercent', 'sealevelpressurehighinches', 'sealevelpressureavginches', 'sealevelpressurelowinches', 'visibilityhighmiles', 'visibilityavgmiles', 'visibilitylowmiles', 'windhighmph', 'windavgmph', 'windgustmph', 'precipitationsuminches', 'events']
Values in database:


# Step 3: Import a single value into the Database
Let's get started wiht the first value in the databse, make sure we can add that, then proceed to more values.

In [33]:
# now let's try to add a single value to the database
print(first_row[0])
sql = """INSERT INTO weather(date) VALUES (%r)""" %(first_row[0])
cursor.execute(sql)
connection.commit()
print_values()

2013-12-21
Columns in database:
['id', 'date', 'temphighf', 'tempavgf', 'templowf', 'dewpointhighf', 'dewpointavgf', 'dewpointlowf', 'humidityhighpercent', 'humidityavgpercent', 'humiditylowpercent', 'sealevelpressurehighinches', 'sealevelpressureavginches', 'sealevelpressurelowinches', 'visibilityhighmiles', 'visibilityavgmiles', 'visibilitylowmiles', 'windhighmph', 'windavgmph', 'windgustmph', 'precipitationsuminches', 'events']
Values in database:
  (1, datetime.date(2013, 12, 21), None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)


# Step 4: Import a row
We will split this step into two parts: First, import the values one by one to ensure there are no conflicts.

In [34]:
# next let's try to add the first row's values into the database one by one. This will help us
# understand if there are any conflicts between the data types of the csv versus the SQL database

for colname, value in zip(columns, first_row):
    sql = """INSERT INTO weather(%s) VALUES (%r)""" %(colname,value)
    #print(sql)
    cursor.execute(sql)
    connection.commit()
print_values()    

Columns in database:
['id', 'date', 'temphighf', 'tempavgf', 'templowf', 'dewpointhighf', 'dewpointavgf', 'dewpointlowf', 'humidityhighpercent', 'humidityavgpercent', 'humiditylowpercent', 'sealevelpressurehighinches', 'sealevelpressureavginches', 'sealevelpressurelowinches', 'visibilityhighmiles', 'visibilityavgmiles', 'visibilitylowmiles', 'windhighmph', 'windavgmph', 'windgustmph', 'precipitationsuminches', 'events']
Values in database:
  (1, datetime.date(2013, 12, 21), None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)
  (2, datetime.date(2013, 12, 21), None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)
  (3, None, 74, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)
  (4, None, None, 60, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None

# Determine preprocessing requirements:
trial and error and with differnt rows show us that we need to do some pre-porcessing on the data, including:
1. adding quotation marks around the "Date" column (otherwise the "-" is interpreted as a minus)
2. adding quotation marks around the "Events" column (some of the values contain commas)
3. Replacing "-" and "F" numerical values with NULL

# Step 5: Import all the rows
Now we're ready to import all the rows. First let's clean up the table we created earlier and create a new table, then we can import all the data. 

In [35]:
# delete the table if it already exists
sql = """DROP table weather"""
cursor.execute(sql)
connection.commit()

# now let's create the table
cursor.execute(sql_create_weather_table)
# and commit to the DB
connection.commit()

In [36]:
with open('austin_weather.csv', 'r') as f:
    reader = csv.reader(f)
    columns = next(reader)
    #print(columns)
    #print(len(my_row))
    f.seek(0)
    for row in f:
        my_row = next(reader)
        last_value = my_row[len(my_row)-1]
        last_value = "'"+last_value+"'"
        my_row[len(my_row)-1] = last_value
        my_row[0] = "'"+my_row[0]+"'"
        for j in range(0,len(my_row)):
            if my_row[j] == "T":
                my_row[j] = "NULL"
            if my_row[j] == "-":
                my_row[j] = "NULL"
        sql = """INSERT INTO weather({0}) VALUES ({1})"""
        sql = sql.format(','.join(columns), ','.join(my_row))
        cursor.execute(sql)
connection.commit()
# hurrah! It works :)

In [38]:
print_values()

Columns in database:
['id', 'date', 'temphighf', 'tempavgf', 'templowf', 'dewpointhighf', 'dewpointavgf', 'dewpointlowf', 'humidityhighpercent', 'humidityavgpercent', 'humiditylowpercent', 'sealevelpressurehighinches', 'sealevelpressureavginches', 'sealevelpressurelowinches', 'visibilityhighmiles', 'visibilityavgmiles', 'visibilitylowmiles', 'windhighmph', 'windavgmph', 'windgustmph', 'precipitationsuminches', 'events']
Values in database:
  (1, datetime.date(2013, 12, 21), 74, 60, 45, 67, 49, 43, 93, 75, 57, 29.86, 29.68, 29.59, 10, 7, 2, 20, 4, 31, 0.46, 'Rain , Thunderstorm')
  (2, datetime.date(2013, 12, 23), 58, 45, 32, 31, 27, 23, 76, 52, 27, 30.56, 30.49, 30.41, 10, 10, 10, 8, 3, 12, 0.0, ' ')
  (3, datetime.date(2013, 12, 25), 58, 50, 41, 44, 40, 36, 86, 71, 56, 30.41, 30.33, 30.27, 10, 10, 7, 10, 2, 16, None, ' ')
  (4, datetime.date(2013, 12, 27), 60, 53, 45, 41, 39, 37, 83, 65, 47, 30.46, 30.39, 30.34, 10, 9, 7, 7, 1, 11, None, ' ')
  (5, datetime.date(2013, 12, 29), 64, 50,

In [39]:
# and finally let's close the connection and the cursor:
connection.close()
cursor.close()