In [1]:
import pandas as pd
from os import listdir
from os.path import isfile, isdir, join

In [2]:
# Two-letter code found at the start of a dataset file name -> prefix used
# when naming the matching DataFrame column.
folder_theme = dict(
    CC="cloud_cover_",
    QQ="global_radiation_",
    TX="max_temp_",
    FX="wind_gust_",
    TG="mean_temp_",
    FG="mean_wind_speed_",
    SS="sunshine_duration_",
    DD="wind_direction_",
)

In [3]:
# Empty placeholder frame.
df = pd.DataFrame()

In [4]:
# Folder that is scanned for data; the literal ends with a space, kept as-is.
current_path = "test_datasets/Daily global radiation QQ "
# Names of the entries of that folder which are themselves directories.
folders = list(filter(lambda name: isdir(join(current_path, name)), listdir(current_path)))

In [5]:
# Names of the entries of current_path that are regular files.
only_files = list(filter(lambda name: isfile(join(current_path, name)), listdir(current_path)))

In [7]:
def clean_dataframe(df, col_name, missing_value=-9999):
    """Turn a raw station frame into a tidy two-column (date, value) frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame whose column labelled ``2`` holds the date (``YYYYMMDD``)
        and whose column labelled ``3`` holds the measurement.
    col_name : str
        Name given to the measurement column in the result.
    missing_value : int or None, default -9999
        Sentinel marking a missing measurement; rows equal to it are dropped.
        Pass ``None`` to keep every row.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ``["date", col_name]``: ``date`` parsed with
        ``pd.to_datetime`` and ``col_name`` cast to ``int``. The original row
        index labels are preserved. The input frame is not modified.

    Raises
    ------
    KeyError
        If column ``2`` or ``3`` is not available once the all-NaN columns
        have been removed.
    """
    # Columns made only of NaNs correspond to runs of spaces in the txt file.
    cleaned = df.dropna(axis=1, how='all').rename(columns={2: "date", 3: col_name})

    # Fail early, with a readable message, when the expected columns are absent.
    absent = [name for name in ("date", col_name) if name not in cleaned.columns]
    if absent:
        raise KeyError("clean_dataframe: expected column(s) {} not found".format(absent))

    # Strip commas from text values. A non-text column has no ``.str`` accessor
    # (AttributeError) and needs no stripping; any other error is a real problem
    # and is left to propagate instead of being silently swallowed.
    try:
        cleaned[col_name] = cleaned[col_name].str.replace(",", "")
    except AttributeError:
        pass

    # Drop rows holding a NaN in any remaining column, then take an explicit copy
    # so the assignments below act on an independent frame
    # (avoids SettingWithCopyWarning).
    cleaned = cleaned.dropna().copy()
    cleaned[col_name] = cleaned[col_name].astype(int)

    # Discard rows flagged with the missing-value sentinel.
    if missing_value is not None:
        cleaned = cleaned[cleaned[col_name] != missing_value]

    # Keep only the two columns of interest and parse the date.
    cleaned = cleaned[["date", col_name]].copy()
    cleaned["date"] = pd.to_datetime(cleaned["date"], format='%Y%m%d')
    return cleaned

In [10]:
# Start from a frame with no columns; it is replaced by the first cleaned file.
df = pd.DataFrame()

for fn in only_files:
    # Column name = theme prefix (looked up from the first two characters of
    # the file name) + a slice of the file name.
    # NOTE(review): fn[8:-3] only strips the last three characters, so the dot
    # before the extension stays in the column name (e.g.
    # "global_radiation_000029."). Left unchanged to keep column names stable.
    column_name = folder_theme[fn[:2]] + fn[8:-3]
    # Read the raw file and reduce it to a (date, value) frame.
    df_temp = pd.read_csv(join(current_path, fn), header=None)
    df_temp = clean_dataframe(df_temp, column_name)
    # Nothing collected yet: the first cleaned frame becomes the accumulator.
    # Testing the columns (rather than df.empty) keeps a file that produced
    # zero rows from being discarded by the next file.
    if df.columns.empty:
        df = df_temp.copy()
    # Otherwise join on the date explicitly, keeping dates found in either frame.
    else:
        df = pd.merge(df, df_temp, how='outer', on='date')

df = df.sort_values(by='date')
df.head()

Unnamed: 0,date,global_radiation_000029.,global_radiation_000034.,global_radiation_000012.,global_radiation_000013.,global_radiation_000014.,global_radiation_000021.,global_radiation_000016.,global_radiation_000032.,global_radiation_000033.,global_radiation_000017.,global_radiation_000028.,global_radiation_000015.
15220,1964-01-02,,,61.0,,41.0,44.0,17.0,,,48.0,5.0,87.0
15221,1964-01-03,,,58.0,,68.0,48.0,47.0,,,44.0,3.0,88.0
15222,1964-01-04,,,56.0,,70.0,60.0,47.0,,,18.0,11.0,90.0
15223,1964-01-05,,,44.0,,70.0,39.0,60.0,,,9.0,24.0,91.0
15224,1964-01-06,,,37.0,,22.0,9.0,7.0,,,21.0,7.0,91.0


In [11]:
# Display the last five rows of df.
df.tail(n=5)

Unnamed: 0,date,global_radiation_000029.,global_radiation_000034.,global_radiation_000012.,global_radiation_000013.,global_radiation_000014.,global_radiation_000021.,global_radiation_000016.,global_radiation_000032.,global_radiation_000033.,global_radiation_000017.,global_radiation_000028.,global_radiation_000015.
15216,2019-09-26,,108.0,,,,,,85.0,159.0,10.0,,
15217,2019-09-27,,149.0,,,,,,108.0,181.0,19.0,,
20360,2019-09-28,,,,,,,,,,9.0,,
15218,2019-09-29,,184.0,,,,,,147.0,188.0,5.0,,
15219,2019-09-30,,108.0,,,,,,95.0,143.0,15.0,,
