# Imports

In [None]:
import os
from pathlib import Path
import pandas as pd
import sys
sys.path.append('..')
from vars import *
from fuzzywuzzy import process, fuzz
import re

# Read Mapping System

In [None]:
# Load the mapping-system sheet; header=1 takes the column labels from the
# second row of the sheet (row index 1).
ms_df = pd.read_excel(MAPPING_SYSTEM_PATH, sheet_name='Vehicle Part Mapping System', header=1)
# Preview the first rows.
ms_df.head()

In [None]:
# Distinct make and model values from the mapping system; they serve as the
# candidate lists for the fuzzy matching further down.
MAKES = ms_df['Make'].unique()
MODELS = ms_df['Model'].unique()

# Read XLSX data

1.  Read the xlsx files
2.  Rework dataframes that have more than 12 columns (fold the extra column back into the rear mounting information)
3.  Concatenate everything into one long dataframe
4.  Rename the columns

In [None]:
# Collect the names of the processed Excel workbooks.
# The list is built in one pass instead of calling files.remove() inside a
# loop over files: removing while iterating skips the entry that follows each
# removed one, so consecutive non-xlsx names could slip through.
# NOTE(review): os.listdir() returns names in arbitrary order, and the
# forward-fills applied later depend on row order — confirm whether the
# workbooks need an explicit sort.
parent = BASE_DIR / 'data' / 'processed'
files = [file for file in os.listdir(parent) if '.xlsx' in file]

files

In [None]:
# Read Sheet1..Sheet10 of every workbook into its own dataframe, skipping the
# 3 leading rows and the 4 trailing rows of each sheet.
dfs = []
for file in files:
    # The context manager closes the workbook handle once its sheets are read;
    # previously every ExcelFile stayed open.
    with pd.ExcelFile(os.path.join(parent, file), engine='openpyxl') as xls:
        for sheet in range(10):
            dfs.append(pd.read_excel(
                xls,
                'Sheet{}'.format(sheet+1),
                skiprows=3,
                skipfooter=4,
            ))

In [None]:
# Preview the first sheet that was read.
dfs[0].head()

In [None]:
# Normalise every sheet to 12 columns: when a sheet has an extra
# 'Unnamed: 12' column, its values fill the gaps of 'Mounting information.1'
# and the extra column is dropped.
fdfs = []
for frame in dfs:
    if frame.shape[1] <= 12:
        fdfs.append(frame)
        continue
    frame['Mounting information.1'] = frame['Mounting information.1'].combine_first(frame['Unnamed: 12'])
    fdfs.append(frame.drop(columns='Unnamed: 12'))

In [None]:
# Stack all per-sheet frames into one long frame.
# NOTE(review): ignore_index is not passed, so each sheet keeps its own row
# labels and the labels repeat across sheets — confirm nothing downstream
# relies on a unique index.
df = pd.concat(fdfs)

In [None]:
# Display the concatenated frame for inspection.
df

In [None]:
# Final column names, in output order: Year and Application followed by five
# front columns and the five matching rear columns.
column_headings = [
    'Year',
    'Application',
    'Front Entry Shock',
    'Front Premium Shock',
    'Front Springs',
    'Front Protection kits/ Mounting kits',
    'Front Mounting information',
    'Rear Entry Shock',
    'Rear Premium Shock',
    'Rear Springs',
    'Rear Protection kits/ Mounting kits',
    'Rear Mounting information',
]

In [None]:
# Rename the raw sheet headers (which contain embedded newlines) to the clean
# names, then keep only the columns listed in column_headings, in that order.
# The un-suffixed headers become the "Front" columns and the '.1' headers the
# "Rear" columns.
df = df.rename(columns={
    'Year': 'Year',
    'Application': 'Application',
    'Entry\nshocks': 'Front Entry Shock',
    'Premium\nshocks': 'Front Premium Shock',
    'Springs': 'Front Springs',
    'Protection\nkits/\nMounting\nkits': 'Front Protection kits/ Mounting kits',
    'Mounting information': 'Front Mounting information',
    'Entry\nshocks.1': 'Rear Entry Shock',
    'Premium\nshocks.1': 'Rear Premium Shock',
    'Springs.1': 'Rear Springs',
    'Protection\nkits/\nMounting\nkits.1': 'Rear Protection kits/ Mounting kits',
    'Mounting information.1': 'Rear Mounting information',
})[column_headings]

# Clean dataframe

In [None]:
# Forward-fill empty Year cells with the last value seen above them.
# The result is assigned back explicitly: an in-place fill on an
# attribute-accessed column is chained assignment (it does not reach the frame
# under pandas Copy-on-Write), and fillna(method=...) is deprecated in favour
# of ffill().
df['Year'] = df['Year'].ffill()

In [None]:
# Scratch check: an empty list is falsy, so display(True) never runs.
if []:
    display(True)

In [None]:
# Survey how well the header rows (Year cells without a '-') fuzzy-match the
# known makes and models. Results:
#   make_matches   matched make  -> header text that produced the match
#   model_matches  matched model -> header text that produced the match
#   no_match_found header texts that matched neither list
no_match_found = []
make_matches = {}
model_matches = {}
for year_cell in df['Year']:
    # Cells that are not strings (e.g. NaN left over before the first header)
    # and cells holding a year range are not header rows.
    if not isinstance(year_cell, str) or '-' in year_cell:
        continue
    # Remove a trailing " continued" marker as a whole suffix. str.rstrip()
    # takes a *set of characters*, so rstrip(' continued') also ate trailing
    # letters of the name itself (e.g. 'Audi continued' -> 'A').
    row_make_or_model = re.sub(r'\s+continued\s*$', '', year_cell).rstrip()
    row_match = process.extractOne(row_make_or_model, MAKES, score_cutoff=85)
    if row_match:
        make_matches[row_match[0]] = row_make_or_model
        continue
    # Not a make: try the model list before giving up.
    row_match = process.extractOne(row_make_or_model, MODELS, score_cutoff=85)
    if row_match:
        model_matches[row_match[0]] = row_make_or_model
    else:
        no_match_found.append(row_make_or_model)
    

In [None]:
# Number of header texts that matched neither a make nor a model.
len(no_match_found)

In [None]:
# Scratch check: fuzzy score of 'RAV4' against the single choice 'RAV'.
process.extract('RAV4', ['RAV'])

In [None]:
# Show the unmatched header texts.
no_match_found

In [None]:
# Number of distinct makes that were matched.
len(make_matches)

In [None]:
# Number of distinct models that were matched.
len(model_matches)

In [None]:
def get_make_or_model(row, makes_or_models, model=True, check_col=False):
    """
    Fuzzy-match a header row against the known makes or models.

    A header row is one whose ``Year`` cell is a string without a ``'-'``
    (rows with a ``'-'`` hold a year range). A trailing ``' continued'``
    marker is removed first; when ``model`` is True a trailing ``' K'``,
    ``' H'``, ``' S'``, ``' W'`` or ``' F'`` code is removed as well.

    Parameters:
        row: mapping / Series with a ``'Year'`` entry.
        makes_or_models: candidate names passed to ``process.extractOne``.
        model: also strip the single-letter trailing codes listed above.
        check_col: return a confidence flag instead of the matched name.

    Returns:
        ``None`` when the row is not a header row or no candidate scores at
        least 85. Otherwise the matched candidate, or - with ``check_col`` -
        ``'In DB'`` for a score of 90 or more and ``'Check'`` below that.
    """
    year_cell = row['Year']
    # Non-string cells (e.g. NaN) used to raise TypeError on the '-' test.
    if not isinstance(year_cell, str) or '-' in year_cell:
        return None

    def _strip_suffix(text, suffix):
        # Remove `suffix` only when it is literally the end of `text`.
        return text[:-len(suffix)] if text.endswith(suffix) else text

    # str.rstrip(chars) strips a *set of characters*, not a suffix, so the
    # previous rstrip(' continued') / rstrip(' K') calls also removed trailing
    # letters of the name itself ('Audi continued' -> 'A', 'CLK' -> 'CL').
    candidate = re.sub(r'\s+continued\s*$', '', year_cell).rstrip()
    if model:
        for suffix in (' K', ' H', ' S', ' W', ' F'):
            candidate = _strip_suffix(candidate, suffix).rstrip()

    if not candidate:
        return None

    # One lookup is enough: whenever the best candidate passes the cutoff it
    # is also the overall best, so its score can drive the 'Check' flag.
    row_match = process.extractOne(candidate, makes_or_models, score_cutoff=85)
    if not row_match:
        return None
    if check_col:
        return 'In DB' if row_match[1] >= 90 else 'Check'
    return row_match[0]

In [None]:
# Snapshot of the frame before the Make / Model columns are added.
# NOTE(review): new_df is not referenced again in the visible cells.
new_df = df.copy()

In [None]:
# Row-wise fuzzy lookups: the matched make, the matched model, and the
# 'In DB' / 'Check' confidence flag for the model match. Positional and
# keyword arguments are forwarded by DataFrame.apply itself.
df['Make'] = df.apply(get_make_or_model, axis=1, args=(MAKES,), model=False)
df['Model'] = df.apply(get_make_or_model, axis=1, args=(MODELS,), model=True)
df['Check Model'] = df.apply(get_make_or_model, axis=1, args=(MODELS,), model=True, check_col=True)

In [None]:
# Carry each header row's model, make and check flag down to the rows below.
# The results are assigned back explicitly: an in-place fill on a column
# obtained through attribute / item access is chained assignment (it does not
# reach the frame under pandas Copy-on-Write), and fillna(method=...) is
# deprecated in favour of ffill().
df['Model'] = df['Model'].ffill()
df['Make'] = df['Make'].ffill()
df['Check Model'] = df['Check Model'].ffill()

In [None]:
# Shape before rows without any part data are dropped.
df.shape

In [None]:
# Drop, in place, every row in which all ten part columns are empty
# (how='all'); rows with at least one part value are kept.
df.dropna(
    subset=[
        'Front Entry Shock',
        'Front Premium Shock',
        'Front Springs',
        'Front Protection kits/ Mounting kits',
        'Front Mounting information',
        'Rear Entry Shock',
        'Rear Premium Shock',
        'Rear Springs',
        'Rear Protection kits/ Mounting kits',
        'Rear Mounting information'
    ],
    how='all',
    inplace=True
)

In [None]:
# Shape after the empty rows were dropped.
df.shape

In [None]:
# Split Year cells on '/' and give every piece its own row; the other
# columns are repeated on each resulting row.
df = df.assign(Year=df['Year'].str.split('/')).explode('Year')

In [None]:
def get_start_or_end_year(row, start_year=True):
    """
    Returns either the starting or the ending year of ``row['Year']``.

    If `start_year` is True the part before the ``'-'`` is used, otherwise the
    part after it; a cell without ``'-'`` is used as a whole for both. The
    word ``'only'`` is ignored.

    Returns:
        * ``1900`` (start) or ``'on'`` (end) when the text contains ``'All'``;
        * ``'on'`` for an open-ended range such as ``'05-on'``;
        * an ``int`` year otherwise: two-digit values below 30 are read as
          20xx, other short values as 19xx, four-digit values are taken as
          they are;
        * ``''`` when no year can be read or the year lies after 2021
          (four-digit years before 1900 are rejected as well).
    """
    position = 0 if start_year else 1

    text = str(row['Year']).replace('only', '')
    if '-' in text:
        # A string containing '-' always splits into at least two pieces, so
        # both positions exist.
        text = text.split('-')[position]

    if 'All' in text:
        return 1900 if start_year else 'on'

    if 'on' in text:
        # Open-ended range; surrounding whitespace ('05 - on') is tolerated,
        # any other text around 'on' is rejected.
        return 'on' if text.strip() == 'on' else ''

    digits = re.sub(r'\D', '', text)
    if not digits:
        print('Value error with year:', row['Year'])
        return ''

    year = int(digits)
    if year >= 1000:
        # Already a full year: adding a century would push it out of range.
        return year if 1900 <= year <= 2021 else ''

    # Short form: pivot at 30 between the 2000s and the 1900s.
    year += 2000 if year < 30 else 1900
    return year if year <= 2021 else ''

In [None]:
# Parse the Year text of every row into its start and end year.
df['Start Year'] = df.apply(get_start_or_end_year, axis=1)
df['End Year'] = df.apply(get_start_or_end_year, axis=1, start_year=False)

In [None]:
# Preview the frame with the new Start Year / End Year columns.
df.head()

In [None]:
def extract_incl_derivatives(row):
    """
    List the derivatives an application line applies to.

    The model name (with ``'/'`` written as ``'&'``) is removed from
    ``row.Application``. What remains before the first ``(incl…`` / ``(excl…``
    clause is split into derivatives, and the items of an ``(incl…)`` clause
    are appended. Text belonging to an ``(excl…)`` clause is never included.

    Returns:
        A list of non-empty, stripped derivative names, or ``''`` when there
        are none.
    """
    model_text = str(row.Model).replace('/', '&')
    text = str(row.Application).replace(model_text, '').strip().replace('\r', '')

    incl_match = re.search(r'\(incl', text, flags=re.IGNORECASE)
    excl_match = re.search(r'\(excl', text, flags=re.IGNORECASE)

    # The leading part ends where the first bracketed clause begins.
    clause_starts = [m.start() for m in (incl_match, excl_match) if m]
    main_part = text[:min(clause_starts)] if clause_starts else text

    if incl_match:
        # NOTE(review): with an include clause the leading part is not split
        # on '/', unlike the branch below - kept as it was; confirm intent.
        derivatives = re.split(r',|&', main_part)

        incl_clause = text[incl_match.start():]
        # Stop at a following "(excl" so excluded names are not included.
        following_excl = re.search(r'\(excl', incl_clause, flags=re.IGNORECASE)
        if following_excl:
            incl_clause = incl_clause[:following_excl.start()]
        # Case-insensitive so "(incl.", "(Incl." and "(INCL." all lose both
        # the keyword and its dot (the lowercase form used to leave '. X').
        incl_clause = re.sub(r'\(incl\.*|\)', '', incl_clause, flags=re.IGNORECASE)
        derivatives.extend(re.split(r',|&|/', incl_clause))
    else:
        derivatives = re.split(r',|&|/', main_part)

    derivatives = [item.strip() for item in derivatives if item.strip()]
    return derivatives or ''

In [None]:
def extract_excl_derivatives(row):
    """
    List the derivatives an application line explicitly excludes.

    The model name (with ``'/'`` written as ``'&'``) is removed from
    ``row.Application``; the items of the ``(excl…)`` clause, split on
    ``','``, ``'&'`` and ``'/'``, are returned.

    Returns:
        A list of non-empty, stripped derivative names, or ``''`` when the
        line has no ``(excl…)`` clause or the clause is empty.
    """
    model_text = str(row.Model).replace('/', '&')
    text = str(row.Application).replace(model_text, '').strip().replace('\r', '')

    excl_match = re.search(r'\(excl', text, flags=re.IGNORECASE)
    if not excl_match:
        return ''

    clause = text[excl_match.start():]
    # Stop at a following "(incl" so included names are not excluded.
    following_incl = re.search(r'\(incl', clause, flags=re.IGNORECASE)
    if following_incl:
        clause = clause[:following_incl.start()]
    # Case-insensitive so "(excl.", "(Excl." and "(EXCL." all lose both the
    # keyword and its dot (the lowercase form used to leave '. X').
    clause = re.sub(r'\(excl\.*|\)', '', clause, flags=re.IGNORECASE)

    derivatives = [item.strip() for item in re.split(r',|&|/', clause) if item.strip()]
    return derivatives or ''

In [None]:
# Derive the included and the excluded derivative lists for every row.
df['Include Derivatives'] = df.apply(extract_incl_derivatives, axis=1)
df['Exclude Derivatives'] = df.apply(extract_excl_derivatives, axis=1)

# Clean up database

## Remove funny characters from parts

In [None]:
def remove_funnies(row, col):
    """
    Clean the text in ``row[col]``.

    Carriage returns become spaces and curly double quotes are removed.
    Values that are not strings are returned unchanged.
    """
    cell = row[col]
    if not isinstance(cell, str):
        return cell
    without_cr = re.sub('\r', ' ', cell)
    return re.sub(r'\“*|\”*', '', without_cr)

In [None]:
# Columns that hold part numbers or mounting text.
part_cols = [
    'Front Entry Shock',
    'Front Premium Shock',
    'Front Springs',
    'Front Protection kits/ Mounting kits',
    'Front Mounting information',
    'Rear Entry Shock',
    'Rear Premium Shock',
    'Rear Springs',
    'Rear Protection kits/ Mounting kits',
    'Rear Mounting information'
]

# Clean every part column row by row; the column name is forwarded to
# remove_funnies through apply's args.
for part_col in part_cols:
    df[part_col] = df.apply(remove_funnies, axis=1, args=(part_col,))

In [None]:
def extract_with_regex(row, col, search_pattern, sub_pattern):
    """
    Return ``row[col]`` with ``sub_pattern`` removed, provided the value is a
    non-empty string in which ``search_pattern`` occurs.

    In every other case (empty or non-string value, pattern not found) the
    result is ``''``.
    """
    cell = row[col]
    if not (cell and isinstance(cell, str)):
        return ''
    if re.search(search_pattern, cell) is None:
        return ''
    return re.sub(sub_pattern, '', cell)

In [None]:
# Split each combined "Protection kits/ Mounting kits" column into separate
# protection-kit (PK…) and mounting-kit (MK…) columns.
# Each entry: (new column, source column, pattern that must occur, pattern to remove).
kit_extractions = (
    ('Rear Protection kits', 'Rear Protection kits/ Mounting kits', r'PK\d*', r'\/\s*MK\d*\s*'),
    ('Rear Mounting kits', 'Rear Protection kits/ Mounting kits', r'MK\d*', r'\“*PK\d*\”*\s*\/*\s*'),
    ('Front Protection kits', 'Front Protection kits/ Mounting kits', r'PK\d*', r'\/\s*MK\d*\s*'),
    ('Front Mounting kits', 'Front Protection kits/ Mounting kits', r'MK\d*', r'\“*PK\d*\”*\s*\/*\s*'),
)
for target_col, source_col, search_pattern, sub_pattern in kit_extractions:
    df[target_col] = df.apply(extract_with_regex, axis=1, args=(source_col, search_pattern, sub_pattern))

In [None]:
# Select the output columns, in their final order. The combined
# "Protection kits/ Mounting kits" columns, Year and Application are left
# out; the split kit columns take their place.
fdf = df[[
    'Make',
    'Model',
    'Check Model',
    'Start Year',
    'End Year',
    'Include Derivatives',
    'Exclude Derivatives',
    'Front Entry Shock',
    'Front Premium Shock',
    'Front Springs',
    'Front Protection kits',
    'Front Mounting kits',
    'Front Mounting information',
    'Rear Entry Shock',
    'Rear Premium Shock',
    'Rear Springs',
    'Rear Protection kits',
    'Rear Mounting kits',
    'Rear Mounting information',
]]

In [None]:
# Split Model values on '/' and give every model name its own row; the other
# columns are repeated on each resulting row.
fdf = fdf.assign(Model=fdf['Model'].str.split('/')).explode('Model')

# Match with mapping system

In [None]:
def match_cols(x, row):
    """
    Decide whether catalogue row ``row`` applies to mapping-system row ``x``.

    The rows match when make and model are equal, ``x['Model Year']`` lies in
    the inclusive range ``row['Start Year']`` .. ``row['End Year']`` (an end
    year of ``'on'`` means the current year), and the derivative filters
    agree: when ``row['Include Derivatives']`` is non-empty at least one entry
    must fuzzy-match ``x['Derivative']`` (WRatio >= 90), and no entry of
    ``row['Exclude Derivatives']`` may do so.

    Parameters:
        x: mapping-system row (``Make``, ``Model``, ``'Model Year'``,
            ``'Derivative'``).
        row: catalogue row (``Make``, ``Model``, ``'Start Year'``,
            ``'End Year'``, ``'Include Derivatives'``,
            ``'Exclude Derivatives'``).

    Returns:
        ``True`` on a match, ``False`` otherwise - including when either year
        bound cannot be read as a number.
    """
    # Imported here because the notebook's import cell does not import it.
    import datetime

    # Cheapest test first: most rows differ in make or model.
    if x.Make != row.Make or x.Model != row.Model:
        return False

    def _to_year(value):
        # int year, or None for '', None, NaN and other unreadable values.
        try:
            return int(value)
        except (TypeError, ValueError):
            return None

    start_year = _to_year(row['Start Year'])
    if row['End Year'] == 'on':
        end_year = datetime.datetime.now().year
    else:
        end_year = _to_year(row['End Year'])
    # An unreadable bound used to raise TypeError in the comparison (start)
    # or print the whole row once per mapping row (end); treat both as
    # "no match" instead.
    if start_year is None or end_year is None:
        return False

    # Inclusive bounds: with strict '<' / '>' the first and last model year of
    # a range - and every single-year range - could never match.
    if not start_year <= x['Model Year'] <= end_year:
        return False

    def _matches_any(derivs):
        # True when any listed derivative fuzzy-matches this vehicle's one.
        return any(fuzz.WRatio(deriv.upper(), x['Derivative']) >= 90 for deriv in derivs)

    incl_derivs = row['Include Derivatives']
    excl_derivs = row['Exclude Derivatives']
    if incl_derivs and not _matches_any(incl_derivs):
        return False
    if excl_derivs and _matches_any(excl_derivs):
        return False
    return True

In [None]:
def derivative_match(x, row, include=True):
    """
    Return the first entry of the row's include (``include=True``) or exclude
    (``include=False``) derivative list whose WRatio score against
    ``x['Derivative']`` is at least 90, or ``''`` when no entry qualifies.
    """
    in_or_ex = 'Include' if include else 'Exclude'
    candidates = row[f'{in_or_ex} Derivatives']
    return next(
        (deriv for deriv in candidates if fuzz.WRatio(deriv, x['Derivative']) >= 90),
        '',
    )

In [None]:
# Mapping-system column (key) -> catalogue column in fdf (value) whose value
# is copied into it. Left and right positions receive the same part.
# NOTE(review): some keys carry irregular spacing ('Rear Right ',
# 'Rear Left    1'); presumably they mirror the spreadsheet headers exactly —
# confirm before normalising them.
column_mapping = {
    'Front Left': 'Front Entry Shock',
    'Front Right': 'Front Entry Shock',
    'Protection Kits Front': 'Front Protection kits',
    'Mounting Kits Front': 'Front Mounting kits',
    'Rear Left': 'Rear Entry Shock',
    'Rear Right ': 'Rear Entry Shock',
    'Protection Kits Rear': 'Rear Protection kits',
    'Mounting Kits Rear': 'Rear Mounting kits',
    'Front Left 1': 'Front Premium Shock',
    'Front Right 1': 'Front Premium Shock',
    'Rear Left    1': 'Rear Premium Shock',
    'Rear Right 1': 'Rear Premium Shock',
}

In [None]:
# For every catalogue row, find the mapping-system rows it applies to, record
# which include / exclude derivative matched, and copy the part values across.
# Rows are processed in order, so a later catalogue row overwrites values
# written by an earlier one for the same mapping-system rows.
for _, row in fdf.iterrows():

    matched = ms_df.apply(match_cols, axis=1, args=(row,))
    indexes = ms_df[matched].index

    for include_flag, target in ((True, 'Include Derivative'), (False, 'Exclude Derivative')):
        ms_df.loc[indexes, target] = ms_df.loc[indexes].apply(
            derivative_match, axis=1, args=(row,), include=include_flag
        )

    for target, source in column_mapping.items():
        ms_df.loc[indexes, target] = row[source]