In [1]:
import sqlite3

from sqlalchemy import (
    create_engine,
    MetaData,
    Table,
    Column,
    String,
    Integer,
    select,
    text

)

from sqlalchemy.orm import sessionmaker

import pandas as pd     

import numpy as np

In [7]:
# Load the catalog file and transpose it so that each row of `df` is one
# record and each column is one field.
catalog_path = 'catalog.json'
df = pd.read_json(catalog_path).transpose()

In [17]:

# List the fields available on the loaded catalog.
catalog_columns = df.columns
print(catalog_columns)

Index(['coi_data', 'concurrent_courses', 'course_campus', 'course_credits',
       'course_description', 'course_id', 'course_offered', 'course_title',
       'department_abbrev', 'gpa_distro', 'is_bottleneck', 'is_gateway',
       'prereq_graph', 'prereq_string'],
      dtype='object')


In [8]:
# 'course_offered' holds more than plain quarter codes, so list only the
# short values (fewer than 9 characters) to see which codes occur.

unique_quarters = df['course_offered'].unique()

short_codes = []
for offered in unique_quarters:
    # Missing entries come through as None and have no length.
    if offered is None:
        continue
    if len(offered) < 9:
        short_codes.append(offered)

print(short_codes)


['S.', 'AS.', 'W.', 'Sp.', 'A.', 'AWSpS.', 'WSp.', 'AWSp.', 'AWS.', 'SpS.', 'ASpS.', 'AW.', 'WS.', 'ASp.', 'WSpS.']


In [6]:
#filter out values that are not seasons
seasons = ['S.', 'A.', 'W.', 'Sp.', 'AS.']

# Keep the filtered rows under their own name instead of rebinding `df`.
# The later cells (uniqueness check, GPA means, final table) were run against
# the full catalog, so overwriting `df` here would discard the unfiltered data
# and change their results on Restart & Run All.
df_seasonal = df[df['course_offered'].isin(seasons)]

# Number of courses whose 'course_offered' value is one of `seasons`.
print(len(df_seasonal))


878


In [9]:
# Count the courses offered in exactly one of these four quarter codes.
seasons = ['S.', 'A.', 'W.', 'Sp.']
offered_in_season = df['course_offered'].isin(seasons)
print(offered_in_season.sum())

865


In [10]:
# course_id is unique when the row count equals the number of distinct ids.
row_count = len(df)
distinct_course_ids = df['course_id'].nunique()
print(row_count)
print(distinct_course_ids)

5777
5777


In [11]:
#set up gpa

# Expand each course's 'gpa_distro' entry into one column per position, so
# gpa_data has one row per course and integer-labelled columns.
gpa_data = df['gpa_distro'].apply(pd.Series)

# Reduce every cell to its 'count' value: dict cells yield their 'count'
# (0 when the key is absent), anything else becomes None (missing).
# DataFrame.applymap is deprecated and emits a FutureWarning; DataFrame.map
# (pandas >= 2.1) is the element-wise replacement with the same semantics.
gpa_data = gpa_data.map(lambda x: x.get('count', 0) if isinstance(x, dict) else None)
print(gpa_data.columns)
print(gpa_data.head())

RangeIndex(start=0, stop=41, step=1)
         0    1    2    3    4    5    6    7    8    9   ...   31   32   33  \
SWA101  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  2.0  2.0  1.0   
SWA102  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  2.0  ...  1.0  3.0  4.0   
SWA103  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  3.0  4.0  2.0   
SWA134  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  1.0  0.0   
SWA201  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   

         34   35   36    37    38    39    40  
SWA101  2.0  7.0  9.0   7.0  21.0  15.0  88.0  
SWA102  2.0  8.0  6.0  10.0  12.0  18.0  79.0  
SWA103  0.0  5.0  4.0   5.0   3.0   9.0  39.0  
SWA134  0.0  0.0  0.0   0.0   0.0   1.0  22.0  
SWA201  0.0  0.0  0.0   0.0   0.0   0.0  12.0  

[5 rows x 41 columns]


  gpa_data = gpa_data.applymap(lambda x: x.get('count', 0) if isinstance(x, (dict,)) else None)


In [12]:
#calculating mean

# GPA value represented by each of the 41 columns: column i stands for i / 10.
gpa_values = np.arange(41, dtype=float) / 10

# Total number of recorded grades per course across columns 0..40.
total_counts = gpa_data[list(range(41))].sum(axis=1)

# Count-weighted average GPA per course: sum(count * gpa) / sum(count).
weighted_sum = gpa_data.mul(gpa_values, axis='columns').sum(axis=1)
mean_gpa = (weighted_sum / total_counts).rename('mean_gpa')



In [13]:
# Append the mean GPA series to the catalog as an extra column, aligned on
# the shared index.
df_mean_gpa = pd.concat((df, mean_gpa), axis='columns')
print(df_mean_gpa.columns)
print(type(mean_gpa))

Index(['coi_data', 'concurrent_courses', 'course_campus', 'course_credits',
       'course_description', 'course_id', 'course_offered', 'course_title',
       'department_abbrev', 'gpa_distro', 'is_bottleneck', 'is_gateway',
       'prereq_graph', 'prereq_string', 'mean_gpa'],
      dtype='object')
<class 'pandas.core.series.Series'>


In [14]:
# Fields that are not carried over into the final table.
unused_columns = [
    'gpa_distro',
    'concurrent_courses',
    'prereq_graph',
    'coi_data',
    'is_gateway',
    'is_bottleneck',
    'prereq_string',
]

# drop() returns a new frame by default, so df_mean_gpa is left untouched.
df_dropped = df_mean_gpa.drop(columns=unused_columns)

# Expose 'course_offered' under the name 'quarters_offered'.
df_final = df_dropped.rename(columns={'course_offered': 'quarters_offered'})


In [None]:
# Inspect the schema of the final frame.
print(df_final.columns)
print(df_final.dtypes)
print(df_final['course_id'].head())
print(type(df_final['course_credits']))
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would add a stray "None" line).
df_final.info()

In [15]:
# Distinct department codes that contain the substring 'CS'.
has_cs = df_final['department_abbrev'].str.contains('CS')
print(df_final.loc[has_cs, 'department_abbrev'].unique())

# First few credit values.
print(df_final['course_credits'].head())

['CSE' 'ETHICS']
SWA101     5.0
SWA102     5.0
SWA103     5.0
SWA134    15.0
SWA201     5.0
Name: course_credits, dtype: object


In [None]:
# Open the SQLite database file that receives the final table.
conn = sqlite3.connect('courses_temp.db')
cursor = conn.cursor()

# Destination table; adjust this to match your actual table name.
table_name = 'class_data'

# Write df_final into that table. index=False means the DataFrame index is
# not stored as a column.
# NOTE(review): to_sql defaults to if_exists='fail', so re-running this cell
# against a database that already has the table raises ValueError.
# NOTE(review): conn is left open here - confirm whether later cells use it,
# and close it once they are done.
df_final.to_sql(name=table_name, con=conn, index=False)

In [16]:
# Distinct department abbreviations present in the final table.
department_column = df_final['department_abbrev']
departments = department_column.unique()
print(departments)

# Membership check: is 'CSE' one of them?
has_cse = 'CSE' in departments
print(has_cse)

['SWA' 'ARCTIC' 'TAGLG' 'AAS' 'CHSTU' 'BENG' 'CFRM' 'AFRAM' 'AES' 'ASIAN'
 'AMATH' 'ART' 'DESIGN' 'AIS' 'INDO' 'INDN' 'ARCHY' 'URDU' 'HINDI' 'SNKRT'
 'CHIN' 'ASTBIO' 'KOREAN' 'HUM' 'VIET' 'JAPAN' 'ASTR' 'CMS' 'GREEK' 'CLAS'
 'LATIN' 'DXARTS' 'ANTH' 'INDIV' 'CHID' 'CHEM' 'TXTDS' 'BIOL' 'DANCE'
 'COM' 'FRENCH' 'ITAL' 'HSTAFM' 'DRAMA' 'CSE' 'ECON' 'HSTLAC' 'HSTAM'
 'HPS' 'HSTRY' 'GWSS' 'HSTAS' 'HSTCMP' 'LABOR' 'GEOG' 'ASL' 'HSTEU'
 'GERMAN' 'RELIG' 'ARAMIC' 'COPTIC' 'GEEZ' 'EGYPT' 'LADINO' 'MICROM' 'LSJ'
 'JSIS' 'ARAB' 'TURKIC' 'CHGTAI' 'LING' 'BIBHEB' 'MODHEB' 'PRSAN' 'MATH'
 'ENGL' 'KYRGYZ' 'KAZAKH' 'UGARIT' 'UZBEK' 'UYGUR' 'TKISH' 'MUSAP' 'MUSED'
 'NEUSCI' 'ETHICS' 'MUSEN' 'MUHST' 'MELC' 'FINN' 'DANISH' 'ESTO' 'LITH'
 'LATV' 'NORW' 'BCMS' 'SWED' 'CZECH' 'BULGR' 'MUSICP' 'ROMN' 'PHIL'
 'POLSH' 'GLITS' 'UKR' 'SLVN' 'PORT' 'SCAND' 'PHYS' 'SPLING' 'MUSIC'
 'SLAVIC' 'RUSS' 'PSYCH' 'CEP' 'CM' 'SPHSC' 'STAT' 'ACCTG' 'ARCH' 'ENTRE'
 'SOC' 'FIN' 'SPAN' 'QMETH' 'OPMGT' 'MGMT' 'DPHS' 'MKTG' 'ORAL

In [18]:
# Show the first few course_id values next to their index labels.
course_id_preview = df_final['course_id'].head()
print(course_id_preview)

SWA101    SWA 101
SWA102    SWA 102
SWA103    SWA 103
SWA134    SWA 134
SWA201    SWA 201
Name: course_id, dtype: object
