In [103]:
import datetime
import gzip
import os
import uuid
from datetime import datetime

import pandas as pd
from dateutil.parser import parse
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import NotFoundError
from elasticsearch.helpers import bulk
from tqdm import tqdm

# Shared Elasticsearch client targeting a node on localhost:9200; the index-management
# and bulk-loading cells further down all go through this handle.
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])



In [3]:
def read_one_day(day, directory):
    """Load the gzipped CSV files for one day and keep only that day's rows.

    Every regular file directly inside ``directory`` whose name contains
    ``day`` is read as a ``;``-separated, gzip-compressed CSV. The frames are
    concatenated, indexed by their parsed ``Timestamp`` column and filtered.

    Parameters
    ----------
    day : str
        Calendar day formatted as ``YYYY-MM-DD``. It is used both as a
        substring to select files and as the date rows must match.
    directory : str
        Folder containing the files. Sub-folders are ignored.

    Returns
    -------
    pandas.DataFrame
        Rows whose timestamp falls on ``day``, indexed by ``Timestamp``
        (the column itself is removed) and sorted chronologically.

    Raises
    ------
    ValueError
        If ``day`` is not a valid ``YYYY-MM-DD`` date, or if no file name in
        ``directory`` contains ``day``.
    """
    # Parsed through pandas rather than ``datetime.datetime.strptime``: the
    # notebook binds the name ``datetime`` to the class, not the module.
    target_date = pd.to_datetime(day, format="%Y-%m-%d").date()

    frames = []
    for f_name in os.listdir(directory):
        path = os.path.join(directory, f_name)
        if day in f_name and os.path.isfile(path):
            # Context manager so the gzip handle is released after parsing.
            with gzip.open(path, "rb") as file:
                frames.append(pd.read_csv(file, sep=";"))

    if not frames:
        raise ValueError(f"No file in {directory!r} has {day!r} in its name")

    df = pd.concat(frames)
    df["Timestamp"] = pd.to_datetime(df["Timestamp"])
    df = df.set_index("Timestamp")

    # Issue with the "clean" data: a single file can hold several days of
    # records. Compare the full calendar date (not just the day-of-month) so
    # rows from another month or year are not kept by accident.
    return df[df.index.date == target_date].sort_index()

In [4]:
def load_data(directory):
    """Read every station folder under ``directory`` into nested dictionaries.

    Each sub-folder of ``directory`` is treated as one station. Inside it,
    every regular file is read as a ``;``-separated, gzip-compressed CSV and
    grouped by the first ten characters of its file name (the date prefix).

    Parameters
    ----------
    directory : str
        Root folder whose sub-folders contain the gzipped CSV files.

    Returns
    -------
    dict
        ``{folder_name: {date_prefix: frame_as_dict}}`` where
        ``frame_as_dict`` is ``DataFrame.to_dict()`` of all files sharing that
        prefix, i.e. ``{column: {row_number: value}}``.
    """
    dir_names = [
        name for name in os.listdir(directory)
        if os.path.isdir(os.path.join(directory, name))
    ]

    dfs = {}
    for d_name in tqdm(dir_names, "Directory"):
        station_dir = os.path.join(directory, d_name)

        frames_by_date = {}
        for f_name in os.listdir(station_dir):
            path = os.path.join(station_dir, f_name)
            if not os.path.isfile(path):
                continue
            # Context manager so the gzip handle is closed once parsed.
            with gzip.open(path, "rb") as file:
                frames_by_date.setdefault(f_name[:10], []).append(
                    pd.read_csv(file, sep=";")
                )

        # ignore_index=True: each file carries its own 0..n index, and
        # to_dict() keys rows by index label, so without renumbering the rows
        # of later files would silently overwrite those of earlier ones.
        dfs[d_name] = {
            date: pd.concat(frames, ignore_index=True).to_dict()
            for date, frames in frames_by_date.items()
        }

    return dfs

In [5]:
def load_all(data_dir="./clean/"):
    """Concatenate every gzipped CSV found one level below ``data_dir``.

    Parameters
    ----------
    data_dir : str, optional
        Root folder; each entry in it is expected to be a folder of
        ``;``-separated, gzip-compressed CSV files. Defaults to ``"./clean/"``,
        the location that was previously hard-coded for the listing.

    Returns
    -------
    pandas.DataFrame
        All files stacked with ``pd.concat``; each file keeps its own row
        index, so index labels repeat across files.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when no file was found.
    """
    dfs = []
    # List and open under the same root; previously the listing always used
    # "./clean/" while files were opened relative to ``data_dir``.
    for d_name in tqdm(os.listdir(data_dir)):
        station_dir = os.path.join(data_dir, d_name)
        for f_name in os.listdir(station_dir):
            # Context manager so the gzip handle is closed once parsed.
            with gzip.open(os.path.join(station_dir, f_name), "r") as file:
                dfs.append(pd.read_csv(file, sep=";"))

    return pd.concat(dfs)

In [114]:
# Start from a clean slate: drop the indices this notebook writes to so that
# re-running the load cells does not duplicate documents.
for index_name in ("velos", "velos_parma", "velos_parma_test"):
    try:
        es.indices.delete(index=index_name)
    except NotFoundError:
        # The index does not exist yet (e.g. first run): nothing to delete.
        # Any other failure (cluster unreachable, permissions, ...) is left to
        # propagate instead of being silently swallowed.
        pass

In [115]:

def load_2016_data(data_dir="clean/", data_index="velos", max_dirs=1):
    """Bulk-index the 2016 records found under ``data_dir`` into Elasticsearch.

    Each folder below ``data_dir`` is scanned for ``;``-separated,
    gzip-compressed CSV files. For every file the ``Timestamp`` column is
    converted to ISO-8601 text, the first four characters of ``Station`` are
    stripped, and the column names are lower-cased. Rows strictly between
    ``2016-01-01 00:00:00`` and ``2016-12-31 23:59:59`` are sent to the
    module-level ``es`` client through its bulk endpoint.

    Parameters
    ----------
    data_dir : str, optional
        Root folder of the data. Defaults to ``"clean/"``.
    data_index : str, optional
        Name of the target index. Defaults to ``"velos"``.
    max_dirs : int or None, optional
        Number of folders (in sorted name order) to process. The default of
        ``1`` keeps the historical behaviour, where the loop stopped after the
        first folder; pass ``None`` to load every folder.

    Returns
    -------
    None
        The function only has the side effect of indexing documents.
    """
    # ISO-8601 strings order chronologically, so the bounds are compared as
    # text; they are computed once rather than for every row.
    lower_bound = parse("2016-01-01 00:00:00").isoformat()
    upper_bound = parse("2016-12-31 23:59:59").isoformat()
    action_line = '{ "index" : { "_index" : "' + data_index + '" } }'
    # Flush threshold counted in bulk lines (action + document = 2 per row).
    flush_at = 1000

    # Sorted so that "the first folder" is reproducible across machines;
    # slicing with None keeps the whole list.
    dir_names = sorted(os.listdir(data_dir))[:max_dirs]

    pending = []
    for d_name in tqdm(dir_names):
        station_dir = os.path.join(data_dir, d_name)
        for f_name in os.listdir(station_dir):
            # Context manager so the gzip handle is closed once parsed.
            with gzip.open(os.path.join(station_dir, f_name), "r") as file:
                df = pd.read_csv(file, sep=";")

            df["Timestamp"] = df["Timestamp"].apply(lambda x: parse(x).isoformat())
            # Drop the 4-character prefix in front of the station name.
            df["Station"] = df["Station"].str[4:]
            df.columns = [c.lower() for c in df.columns]

            in_range = df[(df["timestamp"] > lower_bound) & (df["timestamp"] < upper_bound)]
            for _, row in in_range.iterrows():
                pending.append(action_line)
                pending.append(row.to_json())
                if len(pending) >= flush_at:
                    es.bulk(pending)
                    pending = []

    # Send whatever is left over after the last full batch.
    if pending:
        es.bulk(pending)

# Run the import: sends the rows dated within 2016 to the Elasticsearch bulk
# endpoint. Called for its side effect only; nothing is returned.
load_2016_data()

















  0%|          | 0/25 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

In [44]:

#dfs_light = dfs[(dfs['Timestamp'] > "2016-01-01 00:00:00") & (dfs['Timestamp'] < "2016-12-31 23:59:59")]
#dfs_light.columns = [c.lower() for c in dfs.columns]


3814238



             Timestamp  Station  Bikes  Slots  Total Status  Humidity  \
0  2015-04-22 11:50:00  01. Duc      2      4      6  clear      68.0   
1  2015-04-22 12:00:00  01. Duc      2      4      6  clear      68.0   
2  2015-04-22 12:10:00  01. Duc      2      4      6  clear      68.0   
0  2017-04-09 01:20:00  01. Duc      7      3     10  Clear      71.0   
0  2015-04-22 07:40:00  01. Duc      3      4      7  clear      53.0   

   Pressure Rain  WindDeg  WindSpeed Snow  TemperatureTemp  
0   1028.64   {}  292.501       2.01   {}            18.22  
1   1028.64   {}  292.501       2.01   {}            18.22  
2   1028.64   {}  292.501       2.01   {}            18.22  
0   1022.00   {}  316.502       0.68   {}            14.00  
0    958.22   {}  202.502       1.06   {}             6.81  



987419



             timestamp  station  bikes  slots  total  status  humidity  \
0  2016-09-22 09:00:00  01. Duc      5      4      9  clouds      26.0   
1  2016-09-22 09:10:00 

In [49]:
#for i, row in zip(range(1), dfs.iterrows()):
#    print(row[1].WindDeg)
    

{'timestamp': '2016-09-22 09:00:00', 'station': '01. Duc', 'bikes': 5, 'slots': 4, 'total': 9, 'status': 'clouds', 'humidity': 26.0, 'pressure': 1021.1, 'rain': '{}', 'winddeg': 233.503, 'windspeed': 1.21, 'snow': '{}', 'temperaturetemp': 16.57}


In [116]:
"""
id = 1
for row in dfs.iterrows():
    for date, df in station_dic.items():
        res = es.index(index=f"velos", id=id, body=df)
        print(res['result'])
        id += 1
"""

def gendata(dfs):
    """Yield one bulk-helper action per row of ``dfs``.

    Parameters
    ----------
    dfs : pandas.DataFrame
        Frame whose rows become documents.

    Yields
    ------
    dict
        Action with keys ``_id`` (the row's index label), ``_index``
        (``"velos"``), ``_type`` (``"_doc"``) and ``_source`` (the row as a
        dict, with any value equal to the string ``"None"`` replaced by ``0``).

    Notes
    -----
    The row's index label is used as document id, so rows sharing a label
    produce actions with the same ``_id``.
    """
    # iterrows() already yields (index_label, row); the row is a Series, so it
    # is converted directly instead of being indexed like a tuple.
    for index, row in dfs.iterrows():
        # Build a fresh dict rather than writing back into the row.
        source = {
            key: 0 if item == "None" else item
            for key, item in row.to_dict().items()
        }

        yield {
            "_id": index,
            "_index": "velos",
            "_type": "_doc",
            "_source": source,
        }
