## DATA CLEANING & ETL - FINAL

#### Preparations

In [1]:
import os
import matplotlib
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

#### File Directory

In [2]:
#file directory: 'E:/2022 Summer/5310 SQL/project/kaggle_raw'
os.chdir('E:/2022 Summer/5310 SQL/project/kaggle_raw')
os.getcwd()

'E:\\2022 Summer\\5310 SQL\\project\\kaggle_raw'

### Table1: user

To create user table, we firstly need to merge the original table (Users and UserAchievements together). Since the original file both contain a common column 'user_id', but some id does not exist in both tables. Therefore, to ensure that we sucessfully push the files to the database, we need to merge tables first and then split into 3 tables: user, achievement, user_achievement.

In [3]:
# read data: Users
udf = pd.read_csv('Users.csv')
# extract useful columns and rename
temp_u = udf[['Id', 'UserName', 'PerformanceTier']]
temp_u.columns = ['user_id', 'user_name', 'user_performance_tier']
temp_u.head()

Unnamed: 0,user_id,user_name,user_performance_tier
0,1,kaggleteam,5
1,368,antgoldbloom,2
2,381,iguyon,2
3,383,davidstephan,0
4,384,gabewarren,0


In [4]:
# read data: UserAchievements
adf = pd.read_csv('UserAchievements.csv', encoding = 'latin1')
# extract useful columns and rename
temp_au = adf[['UserId', 'AchievementType']]
temp_au.columns = ['user_id', 'achievement_type']
temp_au.head()

Unnamed: 0,user_id,achievement_type
0,1,Competitions
1,1,Discussion
2,368,Competitions
3,368,Scripts
4,368,Discussion


In [5]:
# merge 2 tables on user_id
uadf = temp_u.merge(temp_au, on='user_id', how = 'outer')
uadf.head()

Unnamed: 0,user_id,user_name,user_performance_tier,achievement_type
0,1,kaggleteam,5.0,Competitions
1,1,kaggleteam,5.0,Discussion
2,1,kaggleteam,5.0,Scripts
3,1,kaggleteam,5.0,Datasets
4,368,antgoldbloom,2.0,Competitions


In [6]:
# drop NA
uadf = uadf.dropna()
# sample and keep 10% of the data
uadf10 = uadf.iloc[:413483, :]

In [8]:
# from uadf10 to table1: users
temp_users = uadf10[['user_id', 'user_name', 'user_performance_tier']]
users = temp_users.drop_duplicates(subset = 'user_id')
users.head()

Unnamed: 0,user_id,user_name,user_performance_tier
0,1,kaggleteam,5.0
1,1,kaggleteam,5.0
2,1,kaggleteam,5.0
3,1,kaggleteam,5.0
4,368,antgoldbloom,2.0


In [10]:
# save to csv
users.to_csv('E:/2022 Summer/5310 SQL/project/clean_final/users.csv',index=False)

### Table2: achievement

In [11]:
# from uadf10 to table2: achievement
# only unique achievement_type
temp_ac = pd.DataFrame(uadf10['achievement_type'].unique(), columns = ['achievement_type'])
temp_ac.head()

Unnamed: 0,achievement_type
0,Competitions
1,Discussion
2,Scripts
3,Datasets


In [12]:
# add column 'achievement_id' and insert new values
temp_ac.insert(0, 'achievement_id', range(1, 1+len(temp_ac)))
achievement = temp_ac
achievement

Unnamed: 0,achievement_id,achievement_type
0,1,Competitions
1,2,Discussion
2,3,Scripts
3,4,Datasets


In [13]:
# save to csv
achievement.to_csv('E:/2022 Summer/5310 SQL/project/clean_final/achievement.csv',index=False)

### Table3: user_achievement

In [14]:
# from uadf10 to table3: user_achievement
# select user_id and achievement_type
ua = uadf10[['user_id', 'achievement_type']]
ua.head()

Unnamed: 0,user_id,achievement_type
0,1,Competitions
1,1,Discussion
2,1,Scripts
3,1,Datasets
4,368,Competitions


In [15]:
# Add pre-created column achievement_id
# competitions = 1, discussion = 2, scripts = 3, datasets = 4
ua2 = ua
ua_conditions = [
    (ua2['achievement_type'] == 'Competitions'),
    (ua2['achievement_type'] == 'Discussion'),
    (ua2['achievement_type'] == 'Scripts'),
    (ua2['achievement_type'] == 'Datasets'),
]

ua_values = [1,2,3,4]

ua2['achievement_id'] = np.select(ua_conditions, ua_values)
ua2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ua2['achievement_id'] = np.select(ua_conditions, ua_values)


Unnamed: 0,user_id,achievement_type,achievement_id
0,1,Competitions,1
1,1,Discussion,2
2,1,Scripts,3
3,1,Datasets,4
4,368,Competitions,1


In [16]:
#drop achievement_type to normalize data
user_achievement = ua2.drop(columns = ['achievement_type'])
user_achievement.head()

Unnamed: 0,user_id,achievement_id
0,1,1
1,1,2
2,1,3
3,1,4
4,368,1


In [17]:
# save to csv
user_achievement.to_csv('E:/2022 Summer/5310 SQL/project/clean_final/user_achievement.csv',index=False)

### Table 4: competition

In [18]:
# read data: Competitions
comp_df = pd.read_csv('Competitions.csv')
# extract useful columns and rename
competition = comp_df[['Id', 'Slug', 'Title', 'Subtitle', 'DeadlineDate','HasLeaderboard', 'MaxDailySubmissions',
                      'MaxTeamSize', 'RewardType', 'RewardQuantity', 'TotalTeams', 'TotalSubmissions']]
competition.columns = ['competition_id', 'slug', 'title', 'subtitle', 'deadline_date','has_leaderboard',
                      'max_dailysub', 'max_teamsize', 'reward_type', 'reward_quantity',
                      'total_teams', 'total_subs']
competition.head()

Unnamed: 0,competition_id,slug,title,subtitle,deadline_date,has_leaderboard,max_dailysub,max_teamsize,reward_type,reward_quantity,total_teams,total_subs
0,2408,Eurovision2010,Forecast Eurovision Voting,This competition requires contestants to forec...,05/25/2010 18:00:00,False,5,20,USD,1000.0,22,22
1,2435,hivprogression,Predict HIV Progression,This contest requires competitors to predict t...,08/02/2010 12:32:00,True,4,20,USD,500.0,107,855
2,2438,worldcup2010,World Cup 2010 - Take on the Quants,Quants at Goldman Sachs and JP Morgan have mod...,06/11/2010 13:29:00,False,5,20,USD,100.0,0,0
3,2439,informs2010,INFORMS Data Mining Contest 2010,The goal of this contest is to predict short t...,10/10/2010 02:28:00,True,5,20,USD,0.0,145,1483
4,2442,worldcupconf,World Cup 2010 - Confidence Challenge,The Confidence Challenge requires competitors ...,06/11/2010 13:28:00,False,5,20,USD,100.0,63,63


In [19]:
# save to csv
competition.to_csv('E:/2022 Summer/5310 SQL/project/clean_final/competition.csv',index=False)

### Table 5: team

Again we need to merge team and submission tables into a single table called tsdf, for normolization purpose.

In [20]:
# read data: Teams & Submissions
tdf = pd.read_csv('Teams.csv')
sdf = pd.read_csv('Submissions.csv')

# extract useful columns and rename - teams
temp_t = tdf[['Id', 'TeamName', 'PublicLeaderboardRank', 'PrivateLeaderboardRank']]
temp_t.columns = ['team_id', 'team_name', 'public_leaderboard_rank', 'private_leaderboard_rank']

# extract useful columns and rename - submissions
temp_s = sdf[['Id', 'TeamId', 'SubmissionDate', 'IsAfterDeadline', 'PublicScoreLeaderboardDisplay']]
temp_s.columns = ['submission_id', 'team_id', 'submission_date', 'is_after_deadline', 'public_leaderboard_score']


In [21]:
# merge 2 tables on team_id innner join
tsdf = temp_t.merge(temp_s, on = 'team_id', how = 'inner')
tsdf.head()

Unnamed: 0,team_id,team_name,public_leaderboard_rank,private_leaderboard_rank,submission_id,submission_date,is_after_deadline,public_leaderboard_score
0,496,team1,59.0,83.0,2180,04/29/2010,False,55.76919
1,496,team1,59.0,83.0,2192,05/04/2010,False,57.21149
2,497,jonp,41.0,25.0,2181,04/30/2010,False,47.11539
3,497,jonp,41.0,25.0,2182,04/30/2010,False,61.0577
4,499,Bwaas,102.0,100.0,2184,05/01/2010,False,47.11539


In [22]:
# drop NA
temp_ts = tsdf[tsdf['team_name'].notna()]
temp_ts.head()

Unnamed: 0,team_id,team_name,public_leaderboard_rank,private_leaderboard_rank,submission_id,submission_date,is_after_deadline,public_leaderboard_score
0,496,team1,59.0,83.0,2180,04/29/2010,False,55.76919
1,496,team1,59.0,83.0,2192,05/04/2010,False,57.21149
2,497,jonp,41.0,25.0,2181,04/30/2010,False,47.11539
3,497,jonp,41.0,25.0,2182,04/30/2010,False,61.0577
4,499,Bwaas,102.0,100.0,2184,05/01/2010,False,47.11539


In [23]:
# sample 20% of the data
ts20 = temp_ts.iloc[:221023, :]

In [69]:
# get table 5: teams
t_team = ts20[['team_id', 'team_name', 'public_leaderboard_rank', 'private_leaderboard_rank']]
team = t_team.drop_duplicates()
team.head()

Unnamed: 0,team_id,team_name,public_leaderboard_rank,private_leaderboard_rank
0,496,team1,59.0,83.0
2,497,jonp,41.0,25.0
4,499,Bwaas,102.0,100.0
5,500,Thylacoleo,31.0,23.0
9,501,pjonesdotcda,67.0,80.0


In [70]:
# save to csv
team.to_csv('E:/2022 Summer/5310 SQL/project/clean_final/team.csv',index=False)

### Table 6: submission

In [None]:
# based on ts20, get table 6: submissions
submission = ts20[['submission_id', 'team_id', 'submission_date', 'is_after_deadline', 'public_leaderboard_score']]
submission.head()

In [30]:
# save to csv
submission.to_csv('E:/2022 Summer/5310 SQL/project/clean_final/submission.csv',index=False)

### Table 7: team_submission

In [31]:
# based on ts20, get table 7: team_submission
team_submission = ts20[['team_id', 'submission_id']]
team_submission.head()

Unnamed: 0,team_id,submission_id
0,496,2180
1,496,2192
2,497,2181
3,497,2182
4,499,2184


In [32]:
# save to csv
team_submission.to_csv('E:/2022 Summer/5310 SQL/project/clean_final/team_submission.csv',index=False)

### Table 8: algorithm
#### competition_id, EvaluationAlgorithmAbbreviation, EvaluationAlgorithmName, EvaluationAlgorithmDescription

In [33]:
## Id for competition_id
## this table will split into 2 tables: competition_algorithm, algorithm
## generate algorithm_id based on unique abbr

#read data: Competitions
temp_com_al_df = pd.read_csv('Competitions.csv')
# extract useful columns and rename
temp_com_al_df = temp_com_al_df[['Id', 'EvaluationAlgorithmAbbreviation', 'EvaluationAlgorithmName', 'EvaluationAlgorithmDescription']]
temp_com_al_df.columns = ['competition_id', 'algorithm_abbr', 'algorithm_name', 'algorithm_descrip']
temp_com_al_df.head()

Unnamed: 0,competition_id,algorithm_abbr,algorithm_name,algorithm_descrip
0,2408,AE,Absolute Error,Total sum of absolute value of each individual...
1,2435,MCE,Mean Consequential Error,Averages consequential error (i.e. if predicti...
2,2438,Custom,Custom Evaluation Metric,A placeholder that indicates a custom algorith...
3,2439,AUC,Area Under Receiver Operating Characteristic C...,Measures discrimination. Calculates how well a...
4,2442,Custom,Custom Evaluation Metric,A placeholder that indicates a custom algorith...


In [34]:
# new column '_ID' convert algorithm_abbr to string
temp_com_al_df['_ID'] = temp_com_al_df['algorithm_abbr'].astype(str)
# factorize algorithm_abbr
temp_com_al_df['unique_id'] = pd.factorize(temp_com_al_df['_ID'])[0]
# assign algorithm_id to algorithm_abbr based on factorized algorithm_abbr
temp_com_al_df['algorithm_id'] = temp_com_al_df['unique_id'] + 1
temp_com_al_df

Unnamed: 0,competition_id,algorithm_abbr,algorithm_name,algorithm_descrip,_ID,unique_id,algorithm_id
0,2408,AE,Absolute Error,Total sum of absolute value of each individual...,AE,0,1
1,2435,MCE,Mean Consequential Error,Averages consequential error (i.e. if predicti...,MCE,1,2
2,2438,Custom,Custom Evaluation Metric,A placeholder that indicates a custom algorith...,Custom,2,3
3,2439,AUC,Area Under Receiver Operating Characteristic C...,Measures discrimination. Calculates how well a...,AUC,3,4
4,2442,Custom,Custom Evaluation Metric,A placeholder that indicates a custom algorith...,Custom,2,3
...,...,...,...,...,...,...,...
5502,36048,RMSE,Root Mean Squared Error,Square root of the average of the squared errors.,RMSE,4,5
5503,36060,CategorizationAccuracy,Categorization Accuracy,Percentage of correctly categorized items,CategorizationAccuracy,19,20
5504,36062,RMSE,Root Mean Squared Error,Square root of the average of the squared errors.,RMSE,4,5
5505,36068,CategorizationAccuracy,Categorization Accuracy,Percentage of correctly categorized items,CategorizationAccuracy,19,20


In [35]:
# drop reapted columns
temp_com_al_df2 = temp_com_al_df.drop(['unique_id', '_ID'], axis = 1)
temp_com_al_df2

Unnamed: 0,competition_id,algorithm_abbr,algorithm_name,algorithm_descrip,algorithm_id
0,2408,AE,Absolute Error,Total sum of absolute value of each individual...,1
1,2435,MCE,Mean Consequential Error,Averages consequential error (i.e. if predicti...,2
2,2438,Custom,Custom Evaluation Metric,A placeholder that indicates a custom algorith...,3
3,2439,AUC,Area Under Receiver Operating Characteristic C...,Measures discrimination. Calculates how well a...,4
4,2442,Custom,Custom Evaluation Metric,A placeholder that indicates a custom algorith...,3
...,...,...,...,...,...
5502,36048,RMSE,Root Mean Squared Error,Square root of the average of the squared errors.,5
5503,36060,CategorizationAccuracy,Categorization Accuracy,Percentage of correctly categorized items,20
5504,36062,RMSE,Root Mean Squared Error,Square root of the average of the squared errors.,5
5505,36068,CategorizationAccuracy,Categorization Accuracy,Percentage of correctly categorized items,20


In [36]:
# from temp_com_al_df2 to table8: algorithm
alg_df = temp_com_al_df2[['algorithm_id','algorithm_abbr','algorithm_name','algorithm_descrip']]
algorithm = alg_df.drop_duplicates()

In [38]:
algorithm.head()

Unnamed: 0,algorithm_id,algorithm_abbr,algorithm_name,algorithm_descrip
0,1,AE,Absolute Error,Total sum of absolute value of each individual...
1,2,MCE,Mean Consequential Error,Averages consequential error (i.e. if predicti...
2,3,Custom,Custom Evaluation Metric,A placeholder that indicates a custom algorith...
3,4,AUC,Area Under Receiver Operating Characteristic C...,Measures discrimination. Calculates how well a...
6,5,RMSE,Root Mean Squared Error,Square root of the average of the squared errors.


In [39]:
# save to csv
algorithm.to_csv('E:/2022 Summer/5310 SQL/project/clean_final/algorithm.csv',index=False)

### Table 9: competition_algorithm

In [40]:
# from temp_com_al_df2 to table9: competition_algorithm
competition_algorithm = temp_com_al_df2[['competition_id', 'algorithm_id']]

In [41]:
competition_algorithm.head()

Unnamed: 0,competition_id,algorithm_id
0,2408,1
1,2435,2
2,2438,3
3,2439,4
4,2442,3


In [42]:
#save to csv
competition_algorithm.to_csv('E:/2022 Summer/5310 SQL/project/clean_final/competition_algorithm.csv',index=False)

### Table 10: tag

In [43]:
# read data: Tags
tag_df = pd.read_csv('Tags.csv')
# extract useful columns and rename
tag = tag_df[['Id', 'Name', 'FullPath', 'Description', 'DatasetCount', 'CompetitionCount', 'KernelCount']]
tag.columns = ['tag_id', 'tag_name', 'fullpath', 'tag_descrip', 'datasetcount', 'competitioncount', 'kernelcount']
tag.head()

Unnamed: 0,tag_id,tag_name,fullpath,tag_descrip,datasetcount,competitioncount,kernelcount
0,1115,websites,subject > science and technology > internet > ...,"Websites, like this one, are collections of co...",223,0,186
1,1219,research,subject > science and technology > research,Research is our endeavor to systematically inc...,356,6,259
2,1220,search engines,subject > science and technology > internet > ...,,59,0,69
3,1222,universities and colleges,subject > people and society > education > uni...,This tag contains all kinds of information abo...,3624,0,379
4,2100,culture and humanities,subject > culture and humanities,What is it to be human? What activities and pa...,177,0,63


In [44]:
# save to csv
tag.to_csv('E:/2022 Summer/5310 SQL/project/clean_final/tag.csv',index=False)

### Table 11: competition_tag

In [45]:
# read data: competition_tag
com_tag_df = pd.read_csv('CompetitionTags.csv')
# extract useful columns and rename
competition_tag = com_tag_df[['CompetitionId', 'TagId']]
competition_tag.columns = ['competition_id', 'tag_id']
competition_tag.head()

Unnamed: 0,competition_id,tag_id
0,3486,14102
1,3526,12116
2,3526,14101
3,3526,14104
4,3706,2606


In [46]:
# save to csv
competition_tag.to_csv('E:/2022 Summer/5310 SQL/project/clean_final/competition_tag.csv',index=False)

### Table 12: organization

In [47]:
# read data: Organizations
org_df = pd.read_csv('Organizations.csv')
# extract useful columns and rename
organization = org_df[['Id', 'Name', 'Description']]
organization.columns = ['organization_id', 'organization_name', 'organization_descrip']
organization.head()

Unnamed: 0,organization_id,organization_name,organization_descrip
0,2,Facebook,Facebook was built to help people connect and ...
1,3,Figure Eight,[Figure Eight](https://www.figure-eight.com/) ...
2,4,Kaggle,Kaggle is a community of data scientists and d...
3,5,Last-Place Ltd.,
4,6,CWILOC,[Climatological Database for the World's Ocean...


In [48]:
# save to csv
organization.to_csv('E:/2022 Summer/5310 SQL/project/clean_final/organization.csv',index=False)

### Table 13: competition_organization

In [49]:
# from comp_df to table13: competition_organization
competition_organization = comp_df[['Id', 'OrganizationId']]
competition_organization.columns = ['competition_id', 'organization_id']
competition_organization.head()

Unnamed: 0,competition_id,organization_id
0,2408,
1,2435,
2,2438,
3,2439,
4,2442,


In [50]:
# save to csv
competition_organization.to_csv('E:/2022 Summer/5310 SQL/project/clean_final/competition_organization.csv',index=False)

### Table 14: host

From original dataset competitions, we will further normalize it into four tables: host, category, competition_host, competition_category.

In [51]:
# read data: Competitions
tem_com_ho_df = pd.read_csv('Competitions.csv')
# extract useful columns and rename
tem_com_ho_df = tem_com_ho_df[['Id', 'HostName', 'HostSegmentTitle']]
tem_com_ho_df.columns = ['competition_id', 'host_name', 'category_name']
tem_com_ho_df

Unnamed: 0,competition_id,host_name,category_name
0,2408,,Featured
1,2435,,Featured
2,2438,,Featured
3,2439,,Featured
4,2442,,Featured
...,...,...,...
5502,36048,,InClass
5503,36060,,InClass
5504,36062,,InClass
5505,36068,,InClass


In [52]:
# factorize host_name and assign new host_id
tem_com_ho_df['_HID'] = tem_com_ho_df['host_name'].astype(str)
tem_com_ho_df['host_id'] = pd.factorize(tem_com_ho_df['_HID'])[0]
tem_com_ho_df

Unnamed: 0,competition_id,host_name,category_name,_HID,host_id
0,2408,,Featured,,0
1,2435,,Featured,,0
2,2438,,Featured,,0
3,2439,,Featured,,0
4,2442,,Featured,,0
...,...,...,...,...,...
5502,36048,,InClass,,0
5503,36060,,InClass,,0
5504,36062,,InClass,,0
5505,36068,,InClass,,0


In [53]:
# from tem_com_ho_df to table14: host
ho_df = tem_com_ho_df[['host_id', 'host_name']]
# drop na
ho = ho_df.dropna()
# drop duplicated rows
host = ho.drop_duplicates()
host.head()

Unnamed: 0,host_id,host_name
14,1,NASA & the Royal Astronomical Society
17,2,Heritage Provider Network
19,3,Allstate
24,4,UCL
36,5,UCI


In [None]:
# save to csv
host.to_csv('E:/2022 Summer/5310 SQL/project/clean_final/host.csv',index=False)

### Table 15: competition_host

In [55]:
# from tem_com_ho_df to table15: competition_host
competition_host = tem_com_ho_df[['competition_id', 'host_id']]
# replace 0 with NaN
competition_host['host_id'].replace(0, np.nan, inplace = True)
competition_host.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  competition_host['host_id'].replace(0, np.nan, inplace = True)


Unnamed: 0,competition_id,host_id
0,2408,
1,2435,
2,2438,
3,2439,
4,2442,


In [56]:
# save to csv
competition_host.to_csv('E:/2022 Summer/5310 SQL/project/clean_final/competition_host.csv',index=False)

### Table 16: category

In [57]:
tem_ca = tem_com_ho_df
tem_ca

Unnamed: 0,competition_id,host_name,category_name,_HID,host_id
0,2408,,Featured,,0
1,2435,,Featured,,0
2,2438,,Featured,,0
3,2439,,Featured,,0
4,2442,,Featured,,0
...,...,...,...,...,...
5502,36048,,InClass,,0
5503,36060,,InClass,,0
5504,36062,,InClass,,0
5505,36068,,InClass,,0


In [58]:
# from tem_com_ho_df to table16: category
# factorize category_name and assign new category_id
tem_ca['_CID'] = tem_ca['category_name'].astype(str)
tem_ca['temp_ca_id'] = pd.factorize(tem_ca['_CID'])[0]
tem_ca['category_id'] = tem_ca['temp_ca_id'] +1
tem_ca

Unnamed: 0,competition_id,host_name,category_name,_HID,host_id,_CID,temp_ca_id,category_id
0,2408,,Featured,,0,Featured,0,1
1,2435,,Featured,,0,Featured,0,1
2,2438,,Featured,,0,Featured,0,1
3,2439,,Featured,,0,Featured,0,1
4,2442,,Featured,,0,Featured,0,1
...,...,...,...,...,...,...,...,...
5502,36048,,InClass,,0,InClass,1,2
5503,36060,,InClass,,0,InClass,1,2
5504,36062,,InClass,,0,InClass,1,2
5505,36068,,InClass,,0,InClass,1,2


In [59]:
# extract useful columns and rename
ca_df = tem_ca[['category_id', 'category_name']]
ca_df

Unnamed: 0,category_id,category_name
0,1,Featured
1,1,Featured
2,1,Featured
3,1,Featured
4,1,Featured
...,...,...
5502,2,InClass
5503,2,InClass
5504,2,InClass
5505,2,InClass


In [60]:
# drop duplicated rows
category = ca_df.drop_duplicates()

In [61]:
# a total of 8 categories
category

Unnamed: 0,category_id,category_name
0,1,Featured
23,2,InClass
38,3,Research
42,4,Prospect
55,5,Recruitment
81,6,GE Quests
95,7,Getting Started
101,8,Playground


In [62]:
# save to csv
category.to_csv('E:/2022 Summer/5310 SQL/project/clean_final/category.csv',index=False)

### Table 17: competition_category

In [63]:
# from tem_com_ho_df to table17: competition_category
competition_category = tem_ca[['competition_id', 'category_id']]
competition_category.head()

Unnamed: 0,competition_id,category_id
0,2408,1
1,2435,1
2,2438,1
3,2439,1
4,2442,1


In [64]:
# save to csv
competition_category.to_csv('E:/2022 Summer/5310 SQL/project/clean_final/competition_category.csv',index=False)