Skip to content

Commit

Permalink
feat: add data selector
Browse files Browse the repository at this point in the history
  • Loading branch information
MarcoBousseau committed Jul 27, 2022
1 parent b2f4904 commit 854fd95
Show file tree
Hide file tree
Showing 2 changed files with 351 additions and 0 deletions.
111 changes: 111 additions & 0 deletions src/data_selector/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import click
import src.data_selector.__init__ as init
from src.data_selector.selector import select


# Root command group: every sub-command of the tool is attached to it.
@click.group()
def cli():
    """Data selection interactive tool."""


# NOTE: the decorator is *called* (``command()``), matching the other
# sub-command of this module. The bare ``@cli.command`` form is only
# understood by click >= 8.1; on older releases it rebinds ``version`` to a
# decorator and the sub-command is never registered.
@cli.command()
def version():
    """Print the application version information"""
    click.echo(init.__version__)


@cli.command()
@click.option(
    "-i", "--input", "input_file",
    type=click.Path(dir_okay=False, file_okay=True, exists=True, readable=True),
    required=True,
    help="Data file to convert",
)
@click.option(
    "-o", "--output", "output_file",
    type=str,
    default="test",
    help="Name for the output files",
)
@click.option(
    "-f", "--force", "overwrite",
    type=bool,
    is_flag=True,
    default=False,
    help="Overwrite existing files",
)
@click.option(
    "-fi", "--format_in", "file_format_in",
    type=str,
    required=True,
    help="File format of the input (csv, json).",
)
@click.option(
    "-fo", "--format_out", "file_format_out",
    type=str,
    required=False,
    default="csv",
    help="File format of the output (csv, json).",
)
@click.option(
    "-S", "--select", "path_columns_to_keep",
    type=str,
    required=False,
    help="Path to file with columns to keep.",
)
@click.option(
    "-D", "--delete", "path_columns_to_delete",
    type=click.Path(),
    required=False,
    help="Path to file with columns to delete.",
)
@click.option(
    "-sD", "--dataColumn", "path_to_data_and_columns",
    type=str,
    default=None,
    help="Path to file with columns and data to keep.",
)
def select_cli(
    input_file: str,
    output_file: str,
    overwrite: bool,
    file_format_in: str,
    file_format_out: str,
    path_columns_to_keep: str,
    path_columns_to_delete: str,
    path_to_data_and_columns: str,
    data_frame=None
):
    """Start service to select Data to Keep/Delete"""
    # Thin adapter: forward the parsed options to ``select``. Note the name
    # mapping: the CLI's input/output formats become ``input_format`` and
    # ``format_choice`` on the library side. ``data_frame`` has no matching
    # option, so it keeps its ``None`` default when invoked through click.
    select(
        input_file=input_file,
        output_file=output_file,
        overwrite=overwrite,
        input_format=file_format_in,
        format_choice=file_format_out,
        path_columns_to_keep=path_columns_to_keep,
        path_columns_to_delete=path_columns_to_delete,
        path_to_data_and_columns=path_to_data_and_columns,
        data_frame=data_frame,
    )


# Run the click command group when this module is executed as a script.
if __name__ == "__main__":
    cli()
240 changes: 240 additions & 0 deletions src/data_selector/selector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
import pandas as pd
from pandas import DataFrame as df
import os
import json
import warnings
warnings.simplefilter("ignore")


def _load_params(path):
    """Return the parsed content of the JSON parameter file at ``path``."""
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


def _read_input(input_file, input_format):
    """Load ``input_file`` with the pandas reader matching ``input_format``.

    The format codes mirror the ones understood by ``save`` ("csv", "json",
    "x" for Excel). Any other value falls back to CSV, which was the only
    reader used previously.
    """
    if input_format == "json":
        return pd.read_json(input_file)
    if input_format == "x":
        return pd.read_excel(input_file)
    return pd.read_csv(input_file)


def select(
    input_file: str,
    output_file: str,
    overwrite: bool,
    input_format: str,
    format_choice: str,
    path_columns_to_keep=None,
    path_columns_to_delete=None,
    path_to_data_and_columns=None,
    data_frame=None
) -> None:
    """Documentation:
    Filter a data set according to JSON parameter files and save the result.

    inputs:
        input_file: path of the data file; only read when ``data_frame``
            is None.
        output_file: path handed to ``save`` for the result.
        overwrite: forwarded to ``save``; allows replacing an existing file.
        input_format: format code of ``input_file`` ("csv", "json", "x");
            also forwarded to ``save`` as the fallback output format.
        format_choice: format code of the output, forwarded to ``save``.
        path_columns_to_keep: optional JSON file whose ``column_names``
            mapping holds, as values, the columns to keep.
        path_columns_to_delete: optional JSON file whose ``column_names``
            mapping holds, as values, the columns to drop.
        path_to_data_and_columns: optional JSON file passed, once parsed,
            to ``select_data_and_column``.
        data_frame: optional already-loaded DataFrame used instead of
            reading ``input_file``.

    The three filters are applied in the order keep -> delete ->
    data-and-columns, each on the result of the previous one.
    """
    if data_frame is None:
        # Honour the declared input format instead of always assuming CSV.
        data_frame = _read_input(input_file, input_format)

    if path_columns_to_keep is not None:
        kept = list(_load_params(path_columns_to_keep)["column_names"].values())
        # reindex (rather than plain selection) creates NaN-filled columns
        # for requested names that are absent from the data.
        data_frame = data_frame.reindex(columns=kept)

    if path_columns_to_delete is not None:
        dropped = list(
            _load_params(path_columns_to_delete)["column_names"].values()
        )
        # One drop call for all columns; a missing column raises KeyError.
        data_frame = data_frame.drop(columns=dropped)

    if path_to_data_and_columns is not None:
        data_frame = select_data_and_column(
            data_frame, _load_params(path_to_data_and_columns)
        )

    save(
        data_frame,
        output_file,
        overwrite,
        format_choice,
        input_format
    )


def save(
    data_frame: df,
    output_file: str,
    overwrite: bool,
    format_choice=None,
    input_format=None
) -> None:
    """Documentation:
    Write ``data_frame`` to ``output_file`` in the requested format.

    inputs:
        data_frame: DataFrame to write.
        output_file: path to the output file.
        overwrite: when False, an already existing ``output_file`` is an
            error.
        format_choice: "csv" (semicolon-separated, no index), "json" or
            "x" (Excel, no index). When None, ``input_format`` is used.
        input_format: fallback format code.

    raises:
        ValueError: the file exists and ``overwrite`` is False, or the
            format code is not recognised.
        TypeError: re-raised with extra context when ``to_excel`` fails
            with a TypeError.
    """
    # Guard first: never touch an existing file unless explicitly allowed.
    if not overwrite and os.path.exists(output_file):
        raise ValueError(
            f"{output_file} already exists. Overwrite option set to False. "
            + "Service failed."
        )

    chosen_format = input_format if format_choice is None else format_choice

    if chosen_format == "csv":
        data_frame.to_csv(output_file, index=False, sep=";")
    elif chosen_format == "json":
        data_frame.to_json(output_file)
    elif chosen_format == "x":
        try:
            data_frame.to_excel(output_file, index=False)
        except TypeError as e:
            raise TypeError("TypeError : " + str(e) + " Wrong output_file. New input path with file : ")
    else:
        raise ValueError("\nError in the choice of the format. Try again.")

    # Reached only when one of the writers above succeeded.
    print("File has been saved. End of the service.")


def delete_column(
    data_frame: df,
    List_of_columns: list[str],
) -> df:
    """Documentation:
    Return a copy of ``data_frame`` without the listed columns.

    inputs:
        data_frame: DataFrame of the data to modify (left untouched).
        List_of_columns: names of the columns to delete.

    A name that is not a column of ``data_frame`` makes pandas raise
    KeyError.
    """
    return data_frame.drop(columns=List_of_columns)


def select_column(
    data_frame: df,
    list_of_column_name: list[str],
) -> df:
    """Documentation:
    Return ``data_frame`` restricted to the given columns, in that order.

    inputs:
        data_frame: DataFrame of the data to modify (left untouched).
        list_of_column_name: names of the columns to return.

    The selection uses ``reindex``, so a requested name that is not a
    column yields a NaN-filled column instead of an error. A ValueError
    raised by pandas is re-raised with a "Value Error : " prefix.
    """
    try:
        return data_frame.reindex(columns=list_of_column_name)
    except ValueError as ve:
        raise ValueError("Value Error : " + str(ve))


def select_data_and_column(
    data_frame: df,
    param_dict: dict
) -> df:
    """Documentation:
    Keep only the configured columns and the rows matching their values.

    inputs:
        data_frame: DataFrame of the data to truncate.
        param_dict: mapping with a "column_names" entry; each key of that
            entry is a column name and each value is a mapping whose
            "value" entry lists the cell values to keep for that column.

    For every column, the rows equal to each listed value are collected
    (one value after the other), then the per-column results are
    concatenated. A row matching several columns/values therefore appears
    several times, and the original index labels are preserved.
    A KeyError raised while filtering (e.g. a missing "value" entry) is
    re-raised with a "KeyError : " prefix.
    """
    # Restrict the frame to the configured columns before filtering.
    data_frame = data_frame.reindex(columns=param_dict["column_names"].keys())
    try:
        per_column_frames = []
        for column, spec in param_dict["column_names"].items():
            matches = [
                data_frame[data_frame[column] == wanted]
                for wanted in spec["value"]
            ]
            per_column_frames.append(pd.concat(matches))
        return pd.concat(per_column_frames)
    except KeyError as e:
        raise KeyError("KeyError : " + str(e))


def check_name_okay(
    name: str,
    data_frame: df
) -> bool:
    """Documentation:
    Tell whether ``name`` is a column name of ``data_frame``.

    inputs:
        name: name to check.
        data_frame: reference DataFrame whose column labels are scanned.

    Column labels are converted with ``str`` before being compared, so a
    non-string label such as ``1`` matches the name "1".
    """
    return any(str(label) == name for label in data_frame)


def get_name_index(
    data_frame: df,
    name: str,
) -> int:
    """Documentation:
    Return the position of the column called ``name`` in ``data_frame``.

    inputs:
        data_frame: DataFrame to analyse.
        name: name of the column we want the index of.

    Both the column labels and ``name`` are compared through ``str``.
    The position of the first match is returned, or -1 when no column
    matches (including when the DataFrame has no column).
    """
    wanted = str(name)
    positions = (
        position
        for position, label in enumerate(data_frame.columns)
        if str(label) == wanted
    )
    return next(positions, -1)


def check_name_valid(data_frame: df, name: str, accept_empty: bool) -> bool:
    """Documentation:
    Tell whether ``name`` is an acceptable column name for ``data_frame``.

    inputs:
        data_frame: reference DataFrame whose column labels are scanned.
        name: name to check.
        accept_empty: when true, the empty string is accepted as well.

    returns:
        True when ``name`` equals the ``str`` form of one of the column
        labels, or when ``accept_empty`` is set and ``name`` is "";
        False otherwise.
    """
    # The empty-name case does not depend on the columns: decide it up
    # front so that it is also accepted for a DataFrame without columns
    # (it used to be tested inside the column loop, which never ran then).
    if accept_empty and name == "":
        return True
    return any(str(col_name) == name for col_name in data_frame)


def handle_name_error(
    data_frame: df,
    name: str,
    accept_empty: bool,
):
    """Documentation:
    Validate a column name and hand it back unchanged.

    inputs:
        data_frame: DataFrame whose columns are the reference.
        name: column name to validate.
        accept_empty: forwarded to ``check_name_valid``.

    returns:
        ``name`` when ``check_name_valid`` accepts it.

    raises:
        ValueError: when ``check_name_valid`` rejects the name
            (the "wrong column name" error).
    """
    if check_name_valid(data_frame, name, accept_empty):
        return name
    raise ValueError("Column name argument not found.")

0 comments on commit 854fd95

Please sign in to comment.