# Data Ingestion
In this stage, we load the CSV files temp.csv, questions.csv, and lectures.csv into PostgreSQL.

In [1]:
import os
from urllib.parse import quote_plus

import pandas as pd
import psycopg
from sqlalchemy import text

## Read csv into Pandas dataframes

- `df_q`: questions
- `df_inter`: interactions
- `df_lec`: lectures
- `df_stu`: students (student id)

In [2]:
# Initial preprocessing of the students' interaction data:
# the unnamed CSV index column is renamed to `interaction_id` and shifted up by one.
df = pd.read_csv('data/temp.csv')
df_inter = df.rename(columns={'Unnamed: 0': 'interaction_id'}).assign(
    interaction_id=lambda frame: frame['interaction_id'] + 1
)

In [3]:
# Locations of the lectures and questions CSV files.
lec, q = 'data/lectures.csv', 'data/questions.csv'

In [4]:
# Load the lectures and questions CSVs, keep only the interaction columns that are
# stored, and derive a students frame (intended for future network features).
df_lec = pd.read_csv(lec)
df_q = pd.read_csv(q)
df_inter = df_inter.loc[
    :,
    [
        'interaction_id',
        'user_id',
        'timestamp',
        'solving_id',
        'question_id',
        'user_answer',
        'elapsed_time',
    ],
]
# One row per distinct user_id, in order of first appearance, exposed as student_id.
df_stu = (
    df_inter[['user_id']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'user_id': 'student_id'})
)

In [5]:
# Preview the first rows of the interactions frame.
df_inter.head()

Unnamed: 0,interaction_id,user_id,timestamp,solving_id,question_id,user_answer,elapsed_time
0,1,235778,1533827536952,1,q8098,b,21000
1,2,235778,1533827561361,2,q8074,b,22000
2,3,235778,1533827587043,3,q176,d,23000
3,4,235778,1533827606515,4,q1279,a,17000
4,5,235778,1533827678676,5,q2067,a,22666


In [12]:
# Preview the first rows of the derived students frame.
df_stu.head()

Unnamed: 0,student_id
0,235778
1,746942
2,837494
3,624986
4,289251


In [13]:
# Preview the first rows of the questions frame.
df_q.head()

Unnamed: 0,question_id,bundle_id,explanation_id,correct_answer,part,tags,deployed_at
0,q1,b1,e1,b,1,1;2;179;181,1558093217098
1,q2,b2,e2,a,1,15;2;182,1558093219720
2,q3,b3,e3,b,1,14;2;179;183,1558093222784
3,q4,b4,e4,b,1,9;2;179;184,1558093225357
4,q5,b5,e5,c,1,8;2;179;181,1558093228439


In [14]:
# Preview the first rows of the lectures frame.
df_lec.head()

Unnamed: 0,lecture_id,part,tags,video_length,deployed_at
0,l520,5,142,-1,-1
1,l592,6,142,-1,-1
2,l1259,1,222,359000,1570424729123
3,l1260,1,220,487000,1570424738105
4,l1261,1,221,441000,1570424743162


In [6]:
#!pip install pandas sqlalchemy psycopg2-binary

## Load pandas dataframe to PostgreSQL

In [7]:
from sqlalchemy import create_engine, types

# Connection settings for the PostgreSQL instance on localhost.
username = 'postgres'
# Prefer the PGPASSWORD environment variable so the secret does not have to live in
# the notebook; the literal is only a fallback that keeps the previous behaviour.
password = os.environ.get('PGPASSWORD', '123')
host = 'localhost'
port = '5432'
database = '54_proj'

# quote_plus escapes characters such as '@', ':' or '/' in the password that would
# otherwise be parsed as part of the URL structure.
engine = create_engine(
    f'postgresql://{username}:{quote_plus(password)}@{host}:{port}/{database}'
)

### define schema datatypes while loading

In [8]:
# Explicit SQL column types for the `questions` table.
dtypes = {
    'question_id': types.VARCHAR(length=255),
    'bundle_id': types.VARCHAR(length=255),
    'explanation_id': types.VARCHAR(length=255),
    'correct_answer': types.CHAR(length=1),
    'part': types.INTEGER,
    'tags': types.VARCHAR(length=255),
    'deployed_at': types.BIGINT,
}
# if_exists='replace' recreates the table on every run.
# NOTE(review): once `interactions` holds a foreign key to this table, the replace
# will presumably be rejected by PostgreSQL until that constraint is dropped — confirm.
df_q.to_sql('questions', con=engine, if_exists='replace', index=False, chunksize=1000, dtype=dtypes)

# Define constraints: question_id as the primary key.
# text() is required because SQLAlchemy 2.x no longer executes plain strings, and
# engine.begin() commits the DDL when the block exits (connect() no longer autocommits).
with engine.begin() as con:
    con.execute(text("ALTER TABLE questions ADD PRIMARY KEY (question_id);"))

In [9]:
# Explicit SQL column types for the `lectures` table.
dtypes = {
    'lecture_id': types.VARCHAR(length=255),
    'part': types.INTEGER,
    'tags': types.VARCHAR(length=255),
    'video_length': types.INTEGER,
    'deployed_at': types.BIGINT,
}
# if_exists='replace' recreates the table on every run.
df_lec.to_sql('lectures', con=engine, if_exists='replace', index=False, chunksize=1000, dtype=dtypes)

# Define constraints: lecture_id as the primary key.
# text() is required because SQLAlchemy 2.x no longer executes plain strings, and
# engine.begin() commits the DDL when the block exits (connect() no longer autocommits).
with engine.begin() as con:
    con.execute(text("ALTER TABLE lectures ADD PRIMARY KEY (lecture_id);"))

In [10]:
# Explicit SQL column type for the `students` table.
dtypes = {'student_id': types.INTEGER}
# if_exists='replace' recreates the table on every run.
# NOTE(review): once `interactions` holds a foreign key to this table, the replace
# will presumably be rejected by PostgreSQL until that constraint is dropped — confirm.
df_stu.to_sql('students', con=engine, if_exists='replace', index=False, chunksize=1000, dtype=dtypes)

# Define constraints: student_id as the primary key.
# text() is required because SQLAlchemy 2.x no longer executes plain strings, and
# engine.begin() commits the DDL when the block exits (connect() no longer autocommits).
with engine.begin() as con:
    con.execute(text("ALTER TABLE students ADD PRIMARY KEY (student_id);"))

In [11]:
# Explicit SQL column types for the `interactions` table.
# Only columns that df_inter actually carries are listed; the former 'tags' and
# 'video_length' entries referred to columns this frame does not have.
dtypes = {
    'interaction_id': types.INTEGER,
    'user_id': types.INTEGER,
    'timestamp': types.BIGINT,
    'solving_id': types.INTEGER,
    'question_id': types.VARCHAR(length=255),
    'user_answer': types.CHAR(length=1),
    'elapsed_time': types.INTEGER,
}
# if_exists='replace' recreates the table on every run.
df_inter.to_sql('interactions', con=engine, if_exists='replace', index=False, chunksize=1000, dtype=dtypes)

# Define constraints: interaction_id as the primary key, user_id references
# students.student_id, question_id references questions.question_id.
# text() is required because SQLAlchemy 2.x no longer executes plain strings, and
# engine.begin() runs the three statements in one transaction that is committed on
# exit (or rolled back if any of them fails).
with engine.begin() as con:
    con.execute(text("ALTER TABLE interactions ADD PRIMARY KEY (interaction_id);"))
    con.execute(text("ALTER TABLE interactions ADD FOREIGN KEY (user_id) REFERENCES students(student_id);"))
    con.execute(text("ALTER TABLE interactions ADD FOREIGN KEY (question_id) REFERENCES questions(question_id);"))