### Add an ID column to the products, accounts, and sales_teams CSV files

In [1]:
import pandas as pd
import numpy as np

# folder paths
original_files_folder_path = "C:\\DataFiles\\CRM_sales\\CSV files\\original_files\\"
modified_files_folder_path = "C:\\DataFiles\\CRM_sales\\CSV files\\modified_files\\"

# list of CSV file names
csv_file_names = ['accounts.csv',
                  'products.csv',
                  'sales_teams.csv']

for file in csv_file_names:
    # read the original lookup table
    open_path = original_files_folder_path + file
    df = pd.read_csv(open_path)

    # sequential 1-based IDs, one per row; taken from the row count rather
    # than from df.index.max(), which is NaN for a header-only file and
    # leaves np.arange without a usable upper bound
    id_values = np.arange(1, len(df) + 1)

    # add new "id" column as the first column of the dataframe
    df.insert(loc=0, column='id', value=id_values)
    print(f'{file} information: ')
    df.info()

    # save the dataframe under the same file name in the modified-files
    # folder (the original file is left untouched)
    save_path = modified_files_folder_path + file
    df.to_csv(save_path, index=False)

accounts.csv information: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85 entries, 0 to 84
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                85 non-null     int32  
 1   account           85 non-null     object 
 2   sector            85 non-null     object 
 3   year_established  85 non-null     int64  
 4   revenue           85 non-null     float64
 5   employees         85 non-null     int64  
 6   office_location   85 non-null     object 
 7   subsidiary_of     15 non-null     object 
dtypes: float64(1), int32(1), int64(2), object(4)
memory usage: 5.1+ KB
products.csv information: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           7 non-null      int32 
 1   product      7 non-null      object
 2   series       7 non-null      obje

### Create a dictionary with ID and full name for the products, accounts, and sales_teams CSV files
Goal: build name → ID lookups that will be used to replace the full names in sales_pipeline.csv with IDs.

In [2]:
import csv

def _load_name_to_id(csv_path):
    """Build a {full name: ID} lookup from one modified CSV file.

    The file is expected to have a header row, the ID in the first column
    and the full name in the second column. IDs are kept as the strings
    read from the file.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.

    Returns
    -------
    dict
        Maps each full name (column 1) to its ID (column 0).
    """
    # newline='' is what the csv module requires for correct handling of
    # quoted fields; utf-8 matches the default encoding pandas wrote with,
    # instead of depending on the platform's locale encoding
    with open(csv_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)

        # skip header
        next(reader, None)

        # key: full name; value: assigned ID (blank lines are ignored)
        return {row[1]: row[0] for row in reader if row}


# one dictionary per CSV file; the unpacking order follows csv_file_names
# (accounts.csv, products.csv, sales_teams.csv)
accounts_dict, products_dict, teams_dict = (
    _load_name_to_id(modified_files_folder_path + file_name)
    for file_name in csv_file_names
)

print(f'Dictionary example: {products_dict}.')

Dictionary example: {'GTX Basic': '1', 'GTX Pro': '2', 'MG Special': '3', 'MG Advanced': '4', 'GTX Plus Pro': '5', 'GTX Plus Basic': '6', 'GTK 500': '7'}.


### Check for values in the dictionary that do not match the full names in sales_pipeline.csv

In [3]:
sales_pipeline_path = f'{original_files_folder_path}sales_pipeline.csv'
sales_df = pd.read_csv(sales_pipeline_path)

# sales_pipeline column and corresponding dictionary
data = {'product': products_dict,
        'account': accounts_dict,
        'sales_agent': teams_dict}

# the loop variable is not named `dict`, so the built-in stays usable for
# the rest of the kernel session
for column, name_to_id in data.items():
    print(f'Sales_pipeline column - {column}:')

    # distinct column values (NaN included) that are not dictionary keys
    column_values = sales_df[column]
    unmatched_values = column_values[~column_values.isin(list(name_to_id))].unique()

    if len(unmatched_values) != 0:
        print(f'Non matching values in sales_pipeline column: {unmatched_values}.')

        # dictionary keys that never occur in the column; the distinct column
        # values are computed once instead of once per key
        present_values = column_values.unique()
        for name in name_to_id:
            if name not in present_values:
                print(f'Non matching values in the dictionary: {name}.')

    else:
        print(f'No missing values in the dictionary.')

Sales_pipeline column - product:
Non matching values in sales_pipeline column: ['GTXPro'].
Non matching values in the dictionary: GTX Pro.
Sales_pipeline column - account:
Non matching values in sales_pipeline column: [nan].
Sales_pipeline column - sales_agent:
No missing values in the dictionary.


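Conclusion: the product is spelled 'GTX Pro' in the dictionary (built from products.csv) but 'GTXPro' in sales_pipeline.csv, so the dictionary key has to be changed to match. The `nan` reported for the account column comes from rows that have no account, not from a spelling mismatch.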

### Modify misspelled values in the products dictionary

In [4]:
# rename the key 'GTX Pro' to 'GTXPro' so it matches the spelling used in
# sales_pipeline.csv; the guard makes re-running this cell a no-op instead
# of a KeyError
if 'GTX Pro' in products_dict:
    products_dict['GTXPro'] = products_dict.pop('GTX Pro')

### Replace full names in sales_pipeline.csv with IDs

In [5]:
# the loop variable is not named `dict`, so the built-in is not rebound
for column, name_to_id in data.items():
    id_column = sales_df[column].map(name_to_id)

    # a non-null name that maps to NaN has no dictionary entry; stop instead
    # of silently blanking it (this also catches a second run of the cell,
    # when the column already holds IDs rather than names)
    unmapped = sales_df.loc[id_column.isna() & sales_df[column].notna(), column].unique()
    if len(unmapped) != 0:
        raise ValueError(f'No ID found for {column} values: {unmapped}.')

    sales_df[column] = id_column

In [6]:
# preview of the sales_pipeline dataframe after the name columns were replaced with IDs
sales_df

Unnamed: 0,opportunity_id,sales_agent,product,account,deal_stage,engage_date,close_date,close_value
0,1C1I7A6R,5,6,9,Won,2016-10-20,2017-03-01,1054.0
1,Z063OYW0,10,2,39,Won,2016-10-25,2017-03-11,4514.0
2,EC4QE1BX,10,3,9,Won,2016-10-25,2017-03-07,50.0
3,MV1LWRNH,5,1,11,Won,2016-10-25,2017-03-09,588.0
4,PE84CX4O,33,1,35,Won,2016-10-25,2017-03-02,517.0
...,...,...,...,...,...,...,...,...
8795,9MIWFW5J,3,4,,Prospecting,,,
8796,6SLKZ8FI,3,4,,Prospecting,,,
8797,LIB4KUZJ,3,4,,Prospecting,,,
8798,18IUIUK0,3,4,,Prospecting,,,


In [7]:
# summary of column dtypes and non-null counts for the modified sales_pipeline dataframe
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8800 entries, 0 to 8799
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   opportunity_id  8800 non-null   object 
 1   sales_agent     8800 non-null   object 
 2   product         8800 non-null   object 
 3   account         7375 non-null   object 
 4   deal_stage      8800 non-null   object 
 5   engage_date     8300 non-null   object 
 6   close_date      6711 non-null   object 
 7   close_value     6711 non-null   float64
dtypes: float64(1), object(7)
memory usage: 550.1+ KB


### Save changes to sales_pipeline.csv

In [8]:
# write the modified dataframe to the modified-files folder, leaving out the index column
output_path = modified_files_folder_path + "sales_pipeline.csv"
sales_df.to_csv(output_path, index=False)