Skip to content

Commit

Permalink
Format with black
Browse files (browse the repository at this point in the history)
  • Loading branch information
myslak71 committed Nov 2, 2019
2 parents 936fccd + d10a971 commit feefe3d
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 30 deletions.
4 changes: 3 additions & 1 deletion csv_report_processer/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@


def get_parser():
parser = ArgumentParser(description=description, formatter_class=RawDescriptionHelpFormatter)
parser = ArgumentParser(
description=description, formatter_class=RawDescriptionHelpFormatter
)
required = parser.add_argument_group('required arguments')
required.add_argument('-i', '--input', help='Input CSV file path', required=True)
required.add_argument('-o', '--output', help='Output CSV file path', required=True)
Expand Down
114 changes: 85 additions & 29 deletions csv_report_processer/report_processer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,16 @@ class ReportProcesser(object):
Has one class attribute: _columns which contains column names to
be used in pandas.DataFrame
"""

_columns = ('date', 'country_code', 'impressions', 'clicks')

def __init__(self):
"""Initialization of the object's DataFrame"""
self.df = pd.DataFrame()

def process_csv_report(self, input_path: str, output_path: str, error_path: str = None):
def process_csv_report(
self, input_path: str, output_path: str, error_path: str = None
):
"""Report processing function.
If possible, converts input file data to specific format and saves to
Expand All @@ -43,31 +46,60 @@ def process_csv_report(self, input_path: str, output_path: str, error_path: str
try:
self._open_report(input_path)
except UnicodeError:
LOGGER.error('Invalid file encoding - supported encoding: UTF-8, UTF-16\nCould not process the file.')
LOGGER.error(
'Invalid file encoding - supported encoding: UTF-8, UTF-16\nCould not process the file.'
)
except FileNotFoundError:
LOGGER.error(f'Input file {input_path} does not exist\nCould not process the file.')
LOGGER.error(
f'Input file {input_path} does not exist\nCould not process the file.'
)
else:
self._convert_data()

df_error = self.df[self.df['error'] == 1]
df_valid = self.df[self.df['error'] != 1].groupby(['date', 'country_code'], as_index=False) \
.agg(self._aggregate_function)
df_valid = (
self.df[self.df['error'] != 1]
.groupby(['date', 'country_code'], as_index=False)
.agg(self._aggregate_function)
)

# concatenate valid data frame with error data frame and save it as CSV file
if df_error.empty or not error_path:
pd.concat([df_valid, df_error]).sort_values(by=['date', 'country_code']) \
.to_csv(output_path, index=False, header=False,
columns=self._columns, line_terminator='\n')
pd.concat([df_valid, df_error]).sort_values(
by=['date', 'country_code']
).to_csv(
output_path,
index=False,
header=False,
columns=self._columns,
line_terminator='\n',
)
word = 'out' if df_error.empty else ''
LOGGER.info(f'File has been converted with{word} errors and saved at {output_path}')
LOGGER.info(
f'File has been converted with{word} errors and saved at {output_path}'
)

else:
df_valid.to_csv(output_path, index=False, header=False,
columns=self._columns, line_terminator='\n')
df_error.to_csv(error_path, index=False, header=False,
columns=self._columns, line_terminator='\n')
LOGGER.info(f'File has been converted with errors and saved at {output_path}')
LOGGER.info(f'Invalid data has been excluded from the result and saved at {error_path}')
df_valid.to_csv(
output_path,
index=False,
header=False,
columns=self._columns,
line_terminator='\n',
)
df_error.to_csv(
error_path,
index=False,
header=False,
columns=self._columns,
line_terminator='\n',
)
LOGGER.info(
f'File has been converted with errors and saved at {output_path}'
)
LOGGER.info(
f'Invalid data has been excluded from the result and saved at {error_path}'
)

@staticmethod
def _aggregate_function(cell: pd.Series) -> np.int64:
Expand Down Expand Up @@ -96,11 +128,22 @@ def _open_report(self, input_path: str):
"""

try:
self.df = pd.read_csv(input_path, names=self._columns, index_col=False,
keep_default_na=False, sep=',')
self.df = pd.read_csv(
input_path,
names=self._columns,
index_col=False,
keep_default_na=False,
sep=',',
)
except UnicodeDecodeError:
self.df = pd.read_csv(input_path, names=self._columns, index_col=False,
keep_default_na=False, sep=',', encoding='utf-16')
self.df = pd.read_csv(
input_path,
names=self._columns,
index_col=False,
keep_default_na=False,
sep=',',
encoding='utf-16',
)

def _convert_data(self):
"""
Expand All @@ -109,32 +152,45 @@ def _convert_data(self):
Tries to convert each cell to corresponding format. If it fails,
changes row 'error' flag to 1.
"""
self.df['country_code'] = self.df['country_code'].apply(self._convert_state_to_country)
self.df['country_code'] = self.df['country_code'].apply(
self._convert_state_to_country
)

self.df['error'] = 0

for row in self.df.itertuples():
# convert date
try:
self.df.at[row.Index, 'date'] = pd.to_datetime(row.date).strftime('%Y-%m-%d')
self.df.at[row.Index, 'date'] = pd.to_datetime(row.date).strftime(
'%Y-%m-%d'
)
except ValueError:
LOGGER.error(
f'Row {row.Index}: Following date could not be converted: {self.df.at[row.Index, "date"]}\n')
f'Row {row.Index}: Following date could not be converted: {self.df.at[row.Index, "date"]}\n'
)
self.df.at[row.Index, 'error'] = 1

# convert impressions and clicks
try:
self.df.at[row.Index, 'impressions'] = int(row.impressions)
self.df.at[row.Index, 'clicks'] = float(str(row.clicks).rstrip('%')) / 100
self.df.at[row.Index, 'clicks'] = round(self.df.at[row.Index, 'clicks'] * int(row.impressions))
self.df.at[row.Index, 'clicks'] = (
float(str(row.clicks).rstrip('%')) / 100
)
self.df.at[row.Index, 'clicks'] = round(
self.df.at[row.Index, 'clicks'] * int(row.impressions)
)
except Exception as e:
if str(e).startswith('invalid literal for int() with base 10: '):
error_message = str(e).replace('invalid literal for int() with base 10: ',
f'Row {row.Index}: Following impression number '
f'could not be converted: ')
error_message = str(e).replace(
'invalid literal for int() with base 10: ',
f'Row {row.Index}: Following impression number '
f'could not be converted: ',
)
else:
error_message = str(e).replace('could not convert string to float: ',
f'Row {row.Index}: Following CTR could not be converted: ')
error_message = str(e).replace(
'could not convert string to float: ',
f'Row {row.Index}: Following CTR could not be converted: ',
)
self.df.at[row.Index, 'error'] = 1
LOGGER.error(error_message)

Expand Down

0 comments on commit feefe3d

Please sign in to comment.