In [1]:
import os
import pandas as pd

In [2]:
# folder that is scanned for the input files
directory: str = "data"

In [3]:
# start from a clean slate: no per-file frames collected yet,
# and an empty placeholder for the combined frame
dfs, data = [], pd.DataFrame()

In [4]:
# read every station file (name starting with "TG_STAID") found in `directory`
# and stack them into a single frame
# - sorted() makes the row order reproducible; os.listdir() returns entries
#   in arbitrary order
# - `dfs` is rebuilt from scratch here, so re-running this cell does not
#   append the same files a second time
# - each file is read once, by pandas only (no separate open()/read() pass)
station_files = sorted(
    filename
    for filename in os.listdir(directory)
    if filename.startswith("TG_STAID")
)
if not station_files:
    # fail with a clear message instead of pd.concat's "No objects to concatenate"
    raise ValueError(f"no files starting with 'TG_STAID' found in {directory!r}")

# the first 20 lines of each file are skipped; the next line is used as the header row
dfs = [
    pd.read_csv(os.path.join(directory, filename), skiprows=20)
    for filename in station_files
]

data = pd.concat(dfs, ignore_index=True)

In [5]:
# return column list
# (the names are taken as-is from the header row of the files, so any padding
#  spaces in that header are part of the column names)
data.columns

Index(['STAID', ' SOUID', '    DATE', '   TG', ' Q_TG'], dtype='object')

In [6]:
# keep only the station id, date and temperature columns and give them
# descriptive names
# - the header names carry padding spaces; they are stripped first so the
#   selection does not depend on the exact amount of padding in the files
# - renaming through an explicit mapping ties each new name to its source
#   column instead of relying on positional order
data = (
    data.rename(columns=str.strip)[['STAID', 'DATE', 'TG']]
    .rename(columns={'STAID': 'station_id', 'DATE': 'date', 'TG': 'temp_c'})
)

data

Unnamed: 0,station_id,date,temp_c
0,6,18600101,9
1,6,18600102,35
2,6,18600103,15
3,6,18600104,24
4,6,18600105,34
...,...,...,...
601272,8,20220527,65
601273,8,20220528,64
601274,8,20220529,55
601275,8,20220530,111


In [7]:
# apply division by 10 transformation
# - presumably the raw readings are tenths of a degree Celsius — confirm
#   against the header of the source files
# - NOTE(review): ECA&D-style station files mark missing days with -9999;
#   those readings are masked to NaN so they do not become a bogus -999.9.
#   Confirm the sentinel against the header of the source files.
# - temp_c is overwritten with its own scaled value, so re-running only this
#   cell would divide a second time; re-run from the load cell instead
MISSING_VALUE = -9999
data['temp_c'] = data['temp_c'].where(data['temp_c'] != MISSING_VALUE) / 10
data

Unnamed: 0,station_id,date,temp_c
0,6,18600101,0.9
1,6,18600102,3.5
2,6,18600103,1.5
3,6,18600104,2.4
4,6,18600105,3.4
...,...,...,...
601272,8,20220527,6.5
601273,8,20220528,6.4
601274,8,20220529,5.5
601275,8,20220530,11.1


In [8]:
# derive a Fahrenheit column from the Celsius one: F = C * 9/5 + 32
celsius = data['temp_c']
data['temp_f'] = celsius.mul(9 / 5).add(32)
data

Unnamed: 0,station_id,date,temp_c,temp_f
0,6,18600101,0.9,33.62
1,6,18600102,3.5,38.30
2,6,18600103,1.5,34.70
3,6,18600104,2.4,36.32
4,6,18600105,3.4,38.12
...,...,...,...,...
601272,8,20220527,6.5,43.70
601273,8,20220528,6.4,43.52
601274,8,20220529,5.5,41.90
601275,8,20220530,11.1,51.98


In [9]:
# write data frame to csv file
# - the path is built from `directory` so the data location is defined in one
#   place instead of being repeated as a literal
# - index=False leaves the row index out of the file
output_path = os.path.join(directory, 'cleaned.csv')
data.to_csv(output_path, index=False)