In [19]:
import pandas as pd
import numpy as np
import json

In [20]:
df = pd.read_csv('data/ST02_MC_MLGCS_NextDay_Tech_Desc_Reformatted.csv')
df.shape

(1671, 5)

In [21]:
df.head()

Unnamed: 0.1,Unnamed: 0,L1 Review,L2 Review,L3: IAR Review,Unnamed: 4
0,TechnicalDescription,L1 Review Comment,L2 Review Comment,Review Comment L3,
1,This table contains details for issue and reso...,,Ok,ok,
2,Surrogate key generated on SupportTopicFullP...,,Ok,ok,
3,Timestamp in UTC at which record is inserted b...,,Ok,ok,
4,Timestamp in UTC at which record is updated by...,,Ok,ok,


In [22]:
df.columns = df.iloc[0]  # use the first row as the new header
df = df[1:]              # drop the first row from the data
df.reset_index(drop=True, inplace=True)

In [23]:
df.columns

Index(['TechnicalDescription',    'L1 Review Comment',    'L2 Review Comment',
          'Review Comment L3',                    nan],
      dtype='object', name=0)

In [24]:
col_to_check = df.columns[4]

# Drop it only if all values are NaN
if df[col_to_check].isna().all():
    df = df.drop(columns=col_to_check)

In [25]:
df = df.dropna(subset=['L1 Review Comment', 'L2 Review Comment', 'Review Comment L3'], how='all').reset_index(drop=True)

In [26]:
cols_to_check = ['TechnicalDescription', 'L1 Review Comment', 'L2 Review Comment', 'Review Comment L3']

df = df[~df.apply(lambda row: list(row.values) == cols_to_check, axis=1)].reset_index(drop=True)

In [27]:
df = df[~df.apply(lambda row: row.astype(str).str.contains('L1 Review', na=False).any(), axis=1)].reset_index(drop=True)

In [28]:
columns_to_check = ["L1 Review Comment", "L2 Review Comment", "Review Comment L3"]

# Function to check if a row should be kept
def valid_row(row):
    # Check if at least one column is NOT 'ok' or NaN
    has_meaningful_comment = any([str(x).strip().lower() != 'ok'and str(x).lower() != 'looks good' and pd.notnull(x) for x in row])
    return has_meaningful_comment

# Apply the filtering function
fail_descriptions = df[df[columns_to_check].apply(valid_row, axis=1)]

# Display filtered DataFrame
print(fail_descriptions.shape)
fail_descriptions.head()

(216, 4)


Unnamed: 0,TechnicalDescription,L1 Review Comment,L2 Review Comment,Review Comment L3
12,This table contains the details for case resol...,,OK,Add the source and source table names and grai...
17,"This table contains Online Safety Team Queues,...",,Ok - I am guessing this I some type of ticket ...,ok
45,This table contains Customer data coming from ...,,Ok,Add the source and source table names and grai...
50,This atrrribte is derived by calculating hex s...,,Ok,It is not a decode logic. change accordingly.\...
56,This is a reference key to DimDate table to ge...,,OK,If it is a direct mapping to Commerical attrib...


In [29]:
# correct descriptions
# Function to check if a row should be kept
def pass_row(row):
    # Check if at least one column is NOT 'ok' or NaN
    has_meaningful_comment = not any([str(x).strip().lower() != 'ok'and str(x).lower() != 'looks good' and pd.notnull(x) for x in row])
    return has_meaningful_comment

# Apply the filtering function
pass_descriptions = df[df[columns_to_check].apply(pass_row, axis=1)]

# Display filtered DataFrame
pass_descriptions

Unnamed: 0,TechnicalDescription,L1 Review Comment,L2 Review Comment,Review Comment L3
0,This table contains details for issue and reso...,,Ok,ok
1,Surrogate key generated on SupportTopicFullP...,,Ok,ok
2,Timestamp in UTC at which record is inserted b...,,Ok,ok
3,Timestamp in UTC at which record is updated by...,,Ok,ok
4,Direct mapping to column SupportTopicFullPath ...,,Ok,ok
...,...,...,...,...
1569,Direct mapping to CASEID column coming from Sp...,ok,,
1570,Direct mapping to IssueCodeL4 column containin...,ok,,
1571,Direct mapping to BotHandledBy column containi...,ok,,
1572,Direct mapping to ProductCategory column conta...,ok,,


In [30]:
#Ref RNO 9 is a referenced row in the sheet
ref_rno_9 = '''If it is a direct mapping to Commerical attribute then add the sentence that it is a direct mapping to xyz column in commercial similar to  rno - 60 as Logic - .....
Otherwise use desc below to write the logic:
Ref desc below - SubmitterEmployeeKey -This is a reference key to EmpMapData entity present within MLGCS datamart.
Logic - CSIEscalation and EmpMapData tables are joined on EmpMapData.EmployeeEmail and CSIEscalation.SubmitterEmail 
coming from Critsit source and EmployeeKey is picked from EmpMapData table to populate SubmitterEmployeeKey.

e.g.-2: This is a reference key to DimGeography table to get more details about customer region related to the messaging impressions data.
Logic -'DisplayName' column from 'Domain_Geo' static domain file coming from Excel file source provided by CPR buisness, is joined with 'Market' column in DimGeography to pick GeographyKey'''
fail_descriptions['Review Comment L3'] = fail_descriptions['Review Comment L3'].replace('ref rno - 9 review comments', ref_rno_9)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fail_descriptions['Review Comment L3'] = fail_descriptions['Review Comment L3'].replace('ref rno - 9 review comments', ref_rno_9)


In [31]:
fail_descriptions.columns

Index(['TechnicalDescription', 'L1 Review Comment', 'L2 Review Comment',
       'Review Comment L3'],
      dtype='object', name=0)

In [32]:
#output technical description, review columns to json
fail_descriptions[['TechnicalDescription', 'L1 Review Comment', 'L2 Review Comment', 'Review Comment L3']].to_json('data/comments_ST02_MLGCS.json', orient='records')

In [33]:
# create new test dataframe with columns "description" and "fail"
temp_df1 = pd.DataFrame(columns=['description', 'fail', 'reason'])

In [34]:
# include all technical descriptions as "description" and "pass" as 1
temp_df1['description'] = pass_descriptions['TechnicalDescription']
temp_df1['fail'] = 0
temp_df1['reason'] = "N/A"

In [35]:
temp_df1.shape

(1358, 3)

In [36]:
# read in reverse_engineered.json
# Make all ReverseEngineeredDescriptions as "description" and "pass" as 0

with open('data/reverse_engineered.json') as f:
    data = json.load(f)
    temp_df2 = pd.DataFrame(data)

# reformat the columns to match test_df
temp_df2['fail'] = 1
temp_df2 = temp_df2.rename(columns={"reverse_engineered_description": "description"})
temp_df2['reason'] = temp_df2['comment']
temp_df2 = temp_df2.drop(columns=['comment'])
temp_df2 = temp_df2.drop(columns=['final_description'])
temp_df2.head()

Unnamed: 0,description,fail,reason
0,This Attribute has a static value - <3001> upl...,1,Provide the logic for surrogate key generation...
1,Direct mapping to ARRType column coming from D...,1,Is it a static source file? Re-check and phras...
2,Direct mapping to ServiceName column coming fr...,1,Is it a static source file? Re-check and phras...
3,Direct mapping to ServicePackageSKU column com...,1,Is it a static source file? Re-check and phras...
4,"This Attribute has a static value (2001,2002,2...",1,Provide the logic for surrogate key generation...


In [37]:
df.head()

Unnamed: 0,TechnicalDescription,L1 Review Comment,L2 Review Comment,Review Comment L3
0,This table contains details for issue and reso...,,Ok,ok
1,Surrogate key generated on SupportTopicFullP...,,Ok,ok
2,Timestamp in UTC at which record is inserted b...,,Ok,ok
3,Timestamp in UTC at which record is updated by...,,Ok,ok
4,Direct mapping to column SupportTopicFullPath ...,,Ok,ok


In [38]:
# output all final technical descriptions to json
all_technical_descriptions = df[['TechnicalDescription']]
all_technical_descriptions.to_json('data/final_descs_ST02_MLGCS.json', orient='records')