In [1]:
import luigi
import pandas as pd
import os
import logging



Création du dossier pour stocker les données nettoyées

In [2]:
# Make sure the folder used for the cleaned data exists before any task runs

import shutil


dossier = 'CleanData'

# Probe the filesystem once and keep the answer in a readable flag
dossier_manquant = not os.path.exists(dossier)

if dossier_manquant:
    # Create the folder (and any missing parents), requesting mode 0o777
    os.makedirs(dossier, mode=0o777)
    print("Dossier créé avec succès.")


In [3]:
import sys

class ReadDataCanton(luigi.Task):
    """Reshape the wide canton results file into one row per candidate slot.

    The input CSV (``;``-separated) is read from ``file_path``. Its first 18
    columns are copied onto every output row; the following columns are
    consumed as 10 consecutive groups of 7, each group becoming its own row
    with the columns ``N°Panneau, Sexe, Nom, Prenom, Voix, % Voix/Ins,
    % Voix/Exp``. The result is left-joined with ``./RawData/Partis.csv`` on
    ``Nom`` and ``Prenom`` and written to ``./CleanData/canton.csv``.

    Parameters:
        file_path: path of the raw canton results CSV.
    """
    file_path = luigi.Parameter()

    def output(self):
        """Return the local target holding the reshaped table."""
        return luigi.LocalTarget("./CleanData/canton.csv")

    def run(self):
        """Build the long-format table, attach the parties and save it.

        Raises:
            ValueError: if a group of candidate columns is incomplete (the
                input has fewer columns than 18 + 10 * 7).
        """
        df = pd.read_csv(self.file_path, sep=";")
        df_partis = pd.read_csv('./RawData/Partis.csv', sep=",")

        shared_width = 18   # leading columns repeated on every output row
        block_width = 7     # columns per candidate group
        block_count = 10    # candidate groups per input row
        candidate_columns = ['N°Panneau', 'Sexe', 'Nom', 'Prenom', 'Voix', '% Voix/Ins', '% Voix/Exp']
        long_columns = list(df.columns[:shared_width]) + candidate_columns

        # Collect plain lists and build the frame once: concatenating inside
        # the loop copies the whole accumulated frame at every iteration.
        records = []
        for _, row in df.iterrows():
            shared_values = row.iloc[:shared_width].tolist()
            for i in range(block_count):
                start = shared_width + i * block_width
                candidate_values = row.iloc[start:start + block_width].tolist()
                records.append(shared_values + candidate_values)

        # dtype=object keeps every cell exactly as read (no int -> float
        # upcast when a stacked column mixes integers and missing values).
        stacked = pd.DataFrame(records, columns=long_columns, dtype=object)

        # Keep the original column layout: the first 25 raw column names come
        # first, then any renamed candidate column that was not among them.
        # NOTE(review): if the raw header spells these 7 columns differently,
        # the raw-named columns stay in the output but are entirely empty.
        cantons = pd.concat(
            [pd.DataFrame(columns=df.columns[0:25]), stacked],
            ignore_index=True,
        )

        cantons = pd.merge(cantons, df_partis, on=['Nom', 'Prenom'], how='left')

        # Diagnostic: number of repeated (Nom, Prenom) pairs on each side of the join
        print(cantons.duplicated(['Nom', 'Prenom']).sum())
        print(df_partis.duplicated(['Nom', 'Prenom']).sum())

        # Write to a temporary path that Luigi renames on success, so a crash
        # mid-write cannot leave a partial file that looks like a finished task.
        with self.output().temporary_path() as tmp_path:
            cantons.to_csv(tmp_path, index=False)




In [4]:
class ExtractCandidats(luigi.Task):
    """Extract the distinct (Prenom, Nom) pairs from the reshaped canton table.

    Depends on ``ReadDataCanton`` and writes ``./CleanData/candidats.csv``.

    Parameters:
        file_path: path of the ``,``-separated CSV to read the candidates from.
            NOTE(review): this is not derived from the required task's
            output (``./CleanData/canton.csv``); the caller must pass a path
            that matches it.
    """
    file_path = luigi.Parameter()

    def requires(self):
        """Require the reshaped canton table built from the 2017 raw file."""
        return ReadDataCanton(file_path="./RawData/canton_t1_2017.csv")

    def output(self):
        """Return the local target holding the candidate list."""
        return luigi.LocalTarget("./CleanData/candidats.csv")

    def run(self):
        """Keep the two name columns, de-duplicate them and save the result.

        Raises:
            KeyError: if the input has no ``Prenom`` or ``Nom`` column.
        """
        df = pd.read_csv(self.file_path, sep=",")

        # No display() call here: that name only exists inside an IPython
        # kernel, so the task would raise NameError when run by any other
        # Luigi worker process.
        candidats = df.loc[:, ["Prenom", "Nom"]].drop_duplicates()

        # Write to a temporary path that Luigi renames on success, so a crash
        # mid-write cannot leave a partial file that looks like a finished task.
        with self.output().temporary_path() as tmp_path:
            candidats.to_csv(tmp_path, index=False)

In [5]:
class ExtractDelinquance(luigi.Task):
    """Keep only the rows of the delinquency file whose ``annee`` equals 17.

    Reads a ``;``-separated CSV from ``file_path`` and writes the filtered
    rows to ``./CleanData/delinquance.csv``.

    Parameters:
        file_path: path of the raw delinquency CSV.
    """
    file_path = luigi.Parameter()

    def output(self):
        """Return the local target holding the filtered rows."""
        return luigi.LocalTarget("./CleanData/delinquance.csv")

    def run(self):
        """Filter the input on ``annee == 17`` and save the result."""
        df = pd.read_csv(self.file_path, sep=";")

        df = df.query('annee == 17')

        # Write to a temporary path that Luigi renames on success, so a crash
        # mid-write cannot leave a partial file that looks like a finished task.
        with self.output().temporary_path() as tmp_path:
            df.to_csv(tmp_path, index=False)

In [6]:
class ExtractNombreChomeur(luigi.Task):
    """Compute, per department, the mean of the monthly columns containing "17".

    Reads a ``;``-separated CSV from ``file_path`` and writes a ``;``-separated
    CSV to ``./CleanData/nombre_chomeurs_departement.csv`` containing the
    department code, the selected monthly columns and a ``Moyenne`` column.

    Parameters:
        file_path: path of the raw job-seekers CSV.
    """
    file_path = luigi.Parameter()

    def output(self):
        """Return the local target holding the per-department table."""
        return luigi.LocalTarget("./CleanData/nombre_chomeurs_departement.csv")

    def run(self):
        """Clean the raw table, average the selected months and save it."""
        df = pd.read_csv(self.file_path, sep=";")
        df = df.rename(columns={"Mois": "CodeDepartement"})

        # Drop the trailing "Total" row by position. Passing
        # `df.count() - 1` to drop() uses per-column non-null counts as row
        # labels, which removes unrelated rows as soon as any column has a
        # missing value. copy() avoids chained-assignment warnings below.
        df = df.iloc[:-1].copy()

        # Keep only the last space-separated token as the department code
        df['CodeDepartement'] = df['CodeDepartement'].str.split(' ').str[-1]

        # Keep the department code plus every column whose name contains "17"
        is_selected_month = df.columns.str.contains("17")
        df = df.loc[:, is_selected_month | (df.columns == 'CodeDepartement')]

        # Average the selected columns row by row. Values are cast to str
        # first so columns already parsed as numbers (no space separator) do
        # not break the .str accessor; spaces are then stripped before the
        # float conversion.
        df_numeric = df.loc[:, df.columns.str.contains("17")]
        df_numeric = df_numeric.apply(
            lambda x: x.astype(str).str.replace(' ', '', regex=False)
        ).astype(float)
        df_moyenne = df_numeric.mean(axis=1).to_frame('Moyenne')
        df = pd.concat([df, df_moyenne], axis=1)

        # Write to a temporary path that Luigi renames on success, so a crash
        # mid-write cannot leave a partial file that looks like a finished task.
        with self.output().temporary_path() as tmp_path:
            df.to_csv(tmp_path, index=False, sep=";")

In [None]:
class ReadAllData(luigi.WrapperTask):
    """Entry point that groups the three extraction tasks of the pipeline.

    This task produces no file of its own, so it is a ``WrapperTask``: it is
    considered complete exactly when all of its requirements are complete.
    As a plain ``Task`` it declared ``result.txt`` as output without ever
    writing it, so it could never be complete and ran again on every build.
    """

    def requires(self):
        """Return the candidate, delinquency and job-seeker extraction tasks."""
        return [ExtractCandidats(file_path="./CleanData/canton.csv"), 
                ExtractDelinquance(file_path="./RawData/Donnees_delinquance.csv"),
                ExtractNombreChomeur(file_path="./RawData/Demandeurs d’emploi inscrits en fin de mois à Pôle emploi par Département.csv")]
    
    def run(self):
        """Print a start-up marker; the real work is done by the requirements."""
        print("lancement")

    def output(self):
        """Return the historical ``result.txt`` target.

        Kept for callers that still ask for it; nothing writes this file and
        it no longer decides whether the task is complete.
        """
        return luigi.LocalTarget('result.txt')

In [7]:

# Run the whole pipeline, optionally wiping previous results first.

config = luigi.configuration.get_config()
# NOTE(review): this sets core.no_lock to 'False' while build() below is
# called with no_lock=True — confirm which of the two is intended.
config.set('core', 'no_lock', 'False')

dossier = "./CleanData/"
Restart = True

# Remove previously generated files so every task is seen as incomplete.
# The folder check avoids a crash when it has not been created yet.
if Restart and os.path.isdir(dossier):
    for fichier in os.listdir(dossier):
        chemin_fichier = os.path.join(dossier, fichier)
        # Only delete files and symlinks: os.remove() raises on directories
        # (for example a .ipynb_checkpoints folder created by Jupyter).
        if os.path.isfile(chemin_fichier) or os.path.islink(chemin_fichier):
            os.remove(chemin_fichier)

# local_scheduler=False sends the tasks to a central Luigi scheduler.
luigi.build([ReadAllData()], local_scheduler=False, no_lock=True)

DEBUG: Checking if ReadAllData() is complete
DEBUG: Checking if ExtractCandidats(file_path=./CleanData/canton.csv) is complete
DEBUG: Checking if ExtractDelinquance(file_path=./RawData/Donnees_delinquance.csv) is complete
INFO: Informed scheduler that task   ReadAllData__99914b932b   has status   PENDING
INFO: Informed scheduler that task   ExtractDelinquance___RawData_Donnee_8bfbeed715   has status   PENDING
DEBUG: Checking if ReadDataCanton(file_path=./RawData/canton_t1_2017.csv) is complete
INFO: Informed scheduler that task   ExtractCandidats___CleanData_cant_f691398620   has status   PENDING
INFO: Informed scheduler that task   ReadDataCanton___RawData_canton_95a4352c04   has status   PENDING
INFO: Done scheduling tasks
INFO: Running Worker with 1 processes
DEBUG: Asking scheduler for work...
DEBUG: Pending tasks: 4
INFO: [pid 16004] Worker Worker(salt=4374408473, workers=1, host=DESKTOP-PBH195U, username=Marin, pid=16004) running   ExtractDelinquance(file_path=./RawData/Donnees_d

0       16
1       16
2       16
3       16
4       16
        ..
8479    22
8480    22
8481    22
8482    22
8483    22
Name: annee, Length: 8484, dtype: int64

Unnamed: 0,classe,annee,Code.département,Code.région,unité.de.compte,millPOP,millLOG,faits,POP,LOG,tauxpourmille
101,Coups et blessures volontaires,17,01,84,victime,17,17,1464,643350,3130614522700008,227558871531825568
102,Coups et blessures volontaires,17,02,32,victime,17,17,1866,534490,2655483196165975,349117850661378126
103,Coups et blessures volontaires,17,03,84,victime,17,17,1013,337988,2077246632561111,299714782773352928
104,Coups et blessures volontaires,17,04,93,victime,17,17,521,163915,1279404230924362,317847664948296371
105,Coups et blessures volontaires,17,05,93,victime,17,17,381,141284,1354926309531483,269669601653407343
...,...,...,...,...,...,...,...,...,...,...,...
7974,Destructions et dégradations volontaires,17,971,1,infraction,17,17,2871,390253,2275253489634245,735676599539273202
7975,Destructions et dégradations volontaires,17,972,2,infraction,17,17,2309,372594,2125771492133937,619709388771692460
7976,Destructions et dégradations volontaires,17,973,3,infraction,17,17,1596,268700,889864502167438,593970971343505827
7977,Destructions et dégradations volontaires,17,974,4,infraction,17,17,5359,853659,3698945213297095,627768230640103386


INFO: [pid 16004] Worker Worker(salt=4374408473, workers=1, host=DESKTOP-PBH195U, username=Marin, pid=16004) done      ExtractDelinquance(file_path=./RawData/Donnees_delinquance.csv)
DEBUG: 1 running tasks, waiting for next task to finish
INFO: Informed scheduler that task   ExtractDelinquance___RawData_Donnee_8bfbeed715   has status   DONE
DEBUG: Asking scheduler for work...
DEBUG: Pending tasks: 3
INFO: [pid 16004] Worker Worker(salt=4374408473, workers=1, host=DESKTOP-PBH195U, username=Marin, pid=16004) running   ReadDataCanton(file_path=./RawData/canton_t1_2017.csv)
