# Merge multiple data files for SQL Server database

It is easier to insert a single file into the database than a separate file for each year. This code merges the yearly files into one file ready for database insertion. The code is suitable for the Alueet and Osalliset files, because they are reasonably small and do not contain year-specific information.

In [11]:
# Used libraries
import pandas as pd
import numpy as np
import glob

#### Define parameters of the source and result files

In [12]:
# Which data set to merge and which column identifies a row in it
src_type = 'alue'
id_col = 'alue_id'

# Glob pattern for the yearly source files and the name of the merged output
src_file = f'tielonnett_*_{src_type}.csv'
result_file = f'../datasets/onnettomuus/db/tielonnett_{src_type}_all.csv'

# CSV format of the source files ...
src_delimiter = ';'
src_encoding = 'utf-8'
# ... and of the merged result file
rslt_delimiter = ';'
rslt_encoding = 'utf-8'

In [13]:
# Directory that holds the yearly source files (note the trailing separator,
# which the pattern below is appended to directly)
path = 'C:\\dataouluk2022\\datasets\\onnettomuus\\db\\'
# List of the files to merge. glob.glob() returns matches in arbitrary order,
# so sort them to make the merge order reproducible between runs.
files = sorted(glob.glob(path + src_file))
#files  # uncomment to inspect the matched files

### Merge selected files

In [14]:
# Read every matched source file, record how many id values it holds and
# stack all of the files into one DataFrame.
if not files:
    # pd.concat() on an empty list only reports "No objects to concatenate";
    # fail early with a message that names the pattern that matched nothing.
    raise ValueError('No source files matched: ' + path + src_file)

li = []    # one DataFrame per source file
rows = []  # one {'Filename', 'Rows'} record per source file
for filename in files:
    # Pass the configured encoding so that src_encoding actually takes effect
    df = pd.read_csv(
        filename,
        delimiter=src_delimiter,
        encoding=src_encoding,
        index_col=None,
        header=0,
    )
    # count() counts the non-null values of the id column
    ro = df[id_col].count()
    # Strip the directory prefix so only the bare file name is shown
    rows.append({'Filename': filename.replace(path, ''), 'Rows': ro})
    li.append(df)

# Per-file summary table, used later to cross-check the merged result
df_rows = pd.DataFrame(rows)
# Stack the files vertically and renumber the index from 0
df_file = pd.concat(li, axis=0, ignore_index=True)

#### Check the source and output file rows

In [15]:
# Display the per-file summary: source file name and its counted rows
df_rows

Unnamed: 0,Filename,Rows
0,tielonnett_2005_alue.csv,432
1,tielonnett_2006_alue.csv,439
2,tielonnett_2007_alue.csv,407
3,tielonnett_2008_alue.csv,418
4,tielonnett_2009_alue.csv,669
5,tielonnett_2010_alue.csv,656
6,tielonnett_2011_alue.csv,660
7,tielonnett_2012_alue.csv,649
8,tielonnett_2013_alue.csv,670
9,tielonnett_2014_alue.csv,313


In [16]:
# Total of the per-file row counts; compare it with the size of the merged table
df_rows['Rows'].sum()

7639

In [17]:
# Display the merged data (the content of the future result file) to check its rows
df_file

Unnamed: 0,alue_id,ELY,Elynimi,Poliisipri,Piirinimi,Maakunta,Maakuntsel,Kunta,Kuntasel
0,16070191,1,Uusimaa,6070,Helsinki,1,Uusimaa,91,Helsinki
1,16010149,1,Uusimaa,6010,Espoo,1,Uusimaa,49,Espoo
2,160101235,1,Uusimaa,6010,Espoo,1,Uusimaa,235,Kauniainen
3,160101257,1,Uusimaa,6010,Espoo,1,Uusimaa,257,Kirkkonummi
4,168601257,1,Uusimaa,6860,Vihti,1,Uusimaa,257,Kirkkonummi
...,...,...,...,...,...,...,...,...,...
7634,155001444,1,Uudenmaan ELY,5500,Helsingin poliisilaitos,1,Uusimaa,444,Lohja
7635,10571015231,10,Etelä-Pohjanmaan ELY,5710,Pohjanmaan poliisilaitos,15,Pohjanmaa,231,Kaskinen
7636,8574011921,8,Pohjois-Savon ELY,5740,Itä-Suomen poliisilaitos,11,Pohjois-Savo,921,Vesanto
7637,155307398,1,Uudenmaan ELY,5530,Länsi-Uudenmaan poliisilaitos,7,Päijät-Häme,398,Lahti


#### Drop duplicate rows (if needed)

In [18]:
# Sanity check: select rows whose ELY code is negative
errors = df_file[df_file['ELY'] < 0]

In [19]:
# Put the selected rows into a DataFrame of their own and display it
df_err = pd.DataFrame(data=errors)
df_err

Unnamed: 0,alue_id,ELY,Elynimi,Poliisipri,Piirinimi,Maakunta,Maakuntsel,Kunta,Kuntasel


In [20]:
#'''
# NOTE(review): the #''' markers look like a toggle for this optional step
# (removing the leading '#' on both would turn the cell into a string) - confirm.
# Sort by the configured id column rather than a hard-coded name, so the cell
# also works when id_col is changed for another source type. A stable sort
# keeps rows with the same id in their merged order, which makes the row
# retained by keep="first" deterministic.
df_file = df_file.sort_values(by=id_col, ascending=True, kind='mergesort')
df_file = df_file.drop_duplicates(subset=id_col, keep="first")
df_file
#'''

Unnamed: 0,alue_id,ELY,Elynimi,Poliisipri,Piirinimi,Maakunta,Maakuntsel,Kunta,Kuntasel
5305,192,0,Tiepiiri tuntematon,0,Ei arvoa,1,Uusimaa,92,Vantaa
5170,798,0,Tiepiiri tuntematon,0,Ei arvoa,7,Päijät-Häme,98,Hollola
5162,2423,0,Tiepiiri tuntematon,0,Ei arvoa,2,Varsinais-Suomi,423,Lieto
5231,4609,0,Tiepiiri tuntematon,0,Ei arvoa,4,Satakunta,609,Pori
5168,4783,0,Tiepiiri tuntematon,0,Ei arvoa,4,Satakunta,783,Säkylä
...,...,...,...,...,...,...,...,...,...
2451,14869019845,14,Lappi,8690,Peräpohjolan poliisilaitos,19,Lappi,845,Tervola
4551,14869019851,14,Lappi,8690,Peräpohjolan poliisilaitos,19,Lappi,851,Tornio
3896,14869019854,14,Lappi,8690,Peräpohjolan poliisilaitos,19,Lappi,854,Pello
3234,14869019976,14,Lappi,8690,Peräpohjolan poliisilaitos,19,Lappi,976,Ylitornio


In [21]:
# Write the merged data to the result file; index=False leaves the
# DataFrame index out of the CSV
df_file.to_csv(
    result_file,
    sep=rslt_delimiter,
    encoding=rslt_encoding,
    index=False,
)