### **Problem 1: Shift-Based Operator Performance and Anomaly Flagging**

The QA department wants to evaluate operator consistency across shifts and detect any red flags.

**Your tasks:**

1. Normalize operator names (e.g., title case, strip whitespace).
2. Extract and validate phone numbers (must be 10-digit only).
3. Ensure email addresses are valid (basic regex rule).
4. Count total batches handled by each operator, grouped by shift.
5. Identify operators whose scrap rate averaged > 10% across the month.
6. Flag any batch that failed QA **and** had scrap rate > 15%.
7. Summarize: Which shift had the highest failure rate?

*Hint: You may need `.groupby()` and aggregation, and build a `Flag` column conditionally.*


In [658]:
import pandas as pd
import numpy as np
import re

In [659]:
data = pd.read_csv('Spool_Manufacturing_Batch_Log.csv')

In [660]:
df = pd.DataFrame(data)

In [661]:
df.head(3)

Unnamed: 0,Batch ID,Date Produced,Material Type,Color,Production Line,Weight (g),Scrap Rate (%),Pass/Fail,Operator,Phone,Email,Shift,Machine Barcode,Lot Number
0,eb6221c8-f45a-49f6-8c0c-ee28f5a29fc0,2025-05-01,PLA,Black,Line 2,1024.84,1.79,Pass,Jacqueline Bass,001-988-061-3911x7775,haynesdavid@yahoo.com,Shift C,MCH-001,L9935
1,9748d109-45e1-4bb0-98af-53396946b791,2025-05-01,PLA,Red,Line 1,1032.38,4.28,Pass,Kristen Cole,300-905-2906x4997,theodore63@yahoo.com,Shift A,MCH-001,L4257
2,35de154c-67d6-4144-a9e2-8afe65353fb2,2025-05-01,ABS,Blue,Line 4,988.29,1.65,Pass,Sherry Bryant,001-741-699-1830x254,timothy04@knox.net,Shift C,MCH-001,L3615


In [662]:
# 1. Normalize operator names (e.g., title case, strip whitespace).
df['Operator'] = df['Operator'].str.title().str.strip()

In [663]:
df['Operator'].head(3)

0    Jacqueline Bass
1       Kristen Cole
2      Sherry Bryant
Name: Operator, dtype: object

In [664]:
# 2. Extract and validate phone numbers (must be 10-digit only).
df['Phone'] = df['Phone'].astype(str)

In [665]:
df['Phone Digits'] = df['Phone'].str.extract(r'(\d{10})')

In [666]:
df['Valid Phone'] = df['Phone Digits'].apply(lambda x: 'VALID' if pd.notnull(x) else 'INVALID')

In [667]:
df[['Phone', 'Phone Digits','Valid Phone']].head()

Unnamed: 0,Phone,Phone Digits,Valid Phone
0,001-988-061-3911x7775,,INVALID
1,300-905-2906x4997,,INVALID
2,001-741-699-1830x254,,INVALID
3,300-905-2906x4997,,INVALID
4,300-905-2906x4997,,INVALID


In [668]:
# Format Phone Numbers
# Convert to string
df['Phone'] = df['Phone'].astype(str)


In [669]:
# Split extension if exists
df[['Phone Base', 'Extension']] = df['Phone'].str.split('x', n=1, expand=True)

In [670]:
df[['Phone Base', 'Extension']].head(3)

Unnamed: 0,Phone Base,Extension
0,001-988-061-3911,7775
1,300-905-2906,4997
2,001-741-699-1830,254


In [671]:
# Remove non-digits from Phone Base
df['Digits Only'] = df['Phone Base'].str.replace(r'\D', '', regex=True)

In [672]:
df[['Phone', 'Phone Base', 'Digits Only', 'Extension']].head(3)

Unnamed: 0,Phone,Phone Base,Digits Only,Extension
0,001-988-061-3911x7775,001-988-061-3911,19880613911,7775
1,300-905-2906x4997,300-905-2906,3009052906,4997
2,001-741-699-1830x254,001-741-699-1830,17416991830,254


In [673]:
# Determine if it has Country Code
df['Country Code'] = df['Digits Only'].apply(lambda x: x[:3] if len(x) > 10 else '')

In [674]:
# Get phone digits
df['Phone Digits'] = df['Digits Only'].apply(lambda x: x[-10:] if len(x) >= 10 else '')

In [675]:
df[['Phone', 'Phone Base', 'Country Code', 'Phone Digits', 'Extension']].head(3)

Unnamed: 0,Phone,Phone Base,Country Code,Phone Digits,Extension
0,001-988-061-3911x7775,001-988-061-3911,1.0,9880613911,7775
1,300-905-2906x4997,300-905-2906,,3009052906,4997
2,001-741-699-1830x254,001-741-699-1830,1.0,7416991830,254


In [676]:
# Format Phone digits
df['Formatted Phone'] = df['Phone Digits'].apply(
    lambda x: f'{x[:3]}-{x[3:6]}-{x[6:]}' if len(x) == 10 else ''
)

In [677]:
df[['Phone', 'Country Code', 'Formatted Phone', 'Extension']].head(3)

Unnamed: 0,Phone,Country Code,Formatted Phone,Extension
0,001-988-061-3911x7775,1.0,988-061-3911,7775
1,300-905-2906x4997,,300-905-2906,4997
2,001-741-699-1830x254,1.0,741-699-1830,254


In [678]:
# add full constructed version
df['Final Phone'] = df.apply(
    lambda row: f"+{row['Country Code']} {row['Formatted Phone']} ext. {row['Extension']}" if row['Extension'] else f"+{row['Country Code']} {row['Formatted Phone']}", axis=1
)

In [679]:
df[['Phone', 'Country Code', 'Formatted Phone', 'Extension', 'Final Phone']].head(3)

Unnamed: 0,Phone,Country Code,Formatted Phone,Extension,Final Phone
0,001-988-061-3911x7775,1.0,988-061-3911,7775,+001 988-061-3911 ext. 7775
1,300-905-2906x4997,,300-905-2906,4997,+ 300-905-2906 ext. 4997
2,001-741-699-1830x254,1.0,741-699-1830,254,+001 741-699-1830 ext. 254


In [680]:
# 3. Ensure email addresses are valid (basic regex rule).
# email_pattern = r'^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'
email_pattern = (
    r'^(?!.*\.\.)'                     # no double dots
    r'(?!\.)(?!.*\.$)'                 # cannot start or end with dot
    r'[A-Za-z0-9._%+-]+'
    r'@'
    r'(?!-)[A-Za-z0-9.-]+(?<!-)'       # domain cannot start or end with hyphen
    r'\.[A-Za-z]{2,}$'
)

In [681]:
df['Email'] = df['Email'].str.strip().str.lower()

In [682]:
df['Valid Email'] = df['Email'].str.match(email_pattern)

In [683]:
df[['Email', 'Valid Email']].head(3)

Unnamed: 0,Email,Valid Email
0,haynesdavid@yahoo.com,True
1,theodore63@yahoo.com,True
2,timothy04@knox.net,True


In [684]:
# 4. Count total batches handled by each operator, grouped by shift.
df_summary = df.groupby(['Operator', 'Shift'])['Batch ID'].size().unstack().fillna(0)

In [685]:
df_summary.head()

Shift,Shift A,Shift B,Shift C
Operator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Amanda Anderson,11,27,18
David Pittman,22,24,19
Jacqueline Bass,15,23,22
Kristen Cole,19,21,21
Mary Anderson,22,19,17


In [686]:
# add total per operator (row-wise)
df_summary['Total'] = df_summary.sum(axis=1)

In [687]:
df_summary

Shift,Shift A,Shift B,Shift C,Total
Operator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Amanda Anderson,11,27,18,56
David Pittman,22,24,19,65
Jacqueline Bass,15,23,22,60
Kristen Cole,19,21,21,61
Mary Anderson,22,19,17,58
Patrick Meyer,18,22,17,57
Paul Macdonald,22,22,23,67
Rita Graves,22,17,20,59
Sherry Bryant,23,16,18,57


In [688]:
# add total per shift (column-wise)
df_summary.loc['Total'] = df_summary.sum()

In [689]:
df_summary

Shift,Shift A,Shift B,Shift C,Total
Operator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Amanda Anderson,11,27,18,56
David Pittman,22,24,19,65
Jacqueline Bass,15,23,22,60
Kristen Cole,19,21,21,61
Mary Anderson,22,19,17,58
Patrick Meyer,18,22,17,57
Paul Macdonald,22,22,23,67
Rita Graves,22,17,20,59
Sherry Bryant,23,16,18,57
Total,174,191,175,540


In [690]:
# 5. Identify operators whose scrap rate averaged > 10% across the month.
avg_scrap_rate_per_operator = df.groupby('Operator')['Scrap Rate (%)'].mean()

In [691]:
avg_scrap_rate_per_operator

Operator
Amanda Anderson    2.046786
David Pittman      2.037846
Jacqueline Bass    2.159667
Kristen Cole       2.055902
Mary Anderson      1.867759
Patrick Meyer      2.291404
Paul Macdonald     2.348358
Rita Graves        1.992881
Sherry Bryant      2.450877
Name: Scrap Rate (%), dtype: float64

In [692]:
monthly_scrap_rate_avg_at_10_percent = df['Scrap Rate (%)'].mean() * 1.1

In [693]:
monthly_scrap_rate_avg_at_10_percent

np.float64(2.354061111111111)

In [694]:
df['Scrap Rate (%) per Operator'] = df['Operator'].map(avg_scrap_rate_per_operator)

In [695]:
df[['Operator', 'Scrap Rate (%) per Operator']].head(3)

Unnamed: 0,Operator,Scrap Rate (%) per Operator
0,Jacqueline Bass,2.159667
1,Kristen Cole,2.055902
2,Sherry Bryant,2.450877


In [696]:
df['Scrap Alert'] = df['Scrap Rate (%) per Operator'] > monthly_scrap_rate_avg_at_10_percent

In [697]:
df[['Operator', 'Scrap Rate (%) per Operator','Scrap Alert']].head(3)

Unnamed: 0,Operator,Scrap Rate (%) per Operator,Scrap Alert
0,Jacqueline Bass,2.159667,False
1,Kristen Cole,2.055902,False
2,Sherry Bryant,2.450877,True


In [698]:
# 6. Flag any batch that failed QA **and** had scrap rate > 15%.
threshold = df['Scrap Rate (%)'].mean() * 1.15
df['Failed and Scrap Rate > 15%'] = (df['Pass/Fail'] == 'Fail') & (df['Scrap Rate (%)'] > threshold)

In [699]:
df[['Batch ID', 'Scrap Rate (%)', 'Pass/Fail', 'Failed and Scrap Rate > 15%']].head(3)

Unnamed: 0,Batch ID,Scrap Rate (%),Pass/Fail,Failed and Scrap Rate > 15%
0,eb6221c8-f45a-49f6-8c0c-ee28f5a29fc0,1.79,Pass,False
1,9748d109-45e1-4bb0-98af-53396946b791,4.28,Pass,False
2,35de154c-67d6-4144-a9e2-8afe65353fb2,1.65,Pass,False


In [700]:
df['Batch Flag'] = df['Failed and Scrap Rate > 15%'].apply(lambda x: 'FLAGGED' if x else '')

In [704]:
df.groupby('Batch ID')['Failed and Scrap Rate > 15%'].sum().tail()

Batch ID
fd8ae99f-34b5-4bf8-b9b9-0b8cc22e1d39    0
fd97a12c-66fa-46d4-be36-5e6c003f797e    0
fe3467eb-73a5-46a8-a27a-fa3eb8be30a8    0
fe5468cf-16e5-4093-860e-dfd36c36a1a4    0
ff3b07a7-a033-48d5-abc1-b1e2a4bf419a    0
Name: Failed and Scrap Rate > 15%, dtype: int64

In [705]:
flagged_batches = df[df['Failed and Scrap Rate > 15%']]

In [706]:
flagged_batches

Unnamed: 0,Batch ID,Date Produced,Material Type,Color,Production Line,Weight (g),Scrap Rate (%),Pass/Fail,Operator,Phone,...,Extension,Digits Only,Country Code,Formatted Phone,Final Phone,Valid Email,Scrap Rate (%) per Operator,Scrap Alert,Failed and Scrap Rate > 15%,Batch Flag
29,09b1234d-7578-4bc7-ad84-d28363101b1e,2025-05-02,ABS,Blue,Line 4,1016.56,3.46,Fail,Paul Macdonald,493-813-3210x19538,...,19538,4938133210,,493-813-3210,+ 493-813-3210 ext. 19538,True,2.348358,False,True,FLAGGED
32,81d0d014-b5ae-43bf-b29f-3072c0792ca2,2025-05-02,TPU,White,Line 5,1040.63,4.03,Fail,Amanda Anderson,406-315-7403x927,...,927,4063157403,,406-315-7403,+ 406-315-7403 ext. 927,True,2.046786,False,True,FLAGGED
36,d4e8db7f-8da1-4c62-b0bd-2bb048c05433,2025-05-03,PLA,White,Line 5,998.21,4.35,Fail,Jacqueline Bass,001-988-061-3911x7775,...,7775,19880613911,1.0,988-061-3911,+001 988-061-3911 ext. 7775,True,2.159667,False,True,FLAGGED
37,d8531e8e-d31a-453c-9b3d-06edfdde57d2,2025-05-03,TPU,Black,Line 2,869.01,3.23,Fail,Rita Graves,538.709.6430x8749,...,8749,5387096430,,538-709-6430,+ 538-709-6430 ext. 8749,True,1.992881,False,True,FLAGGED
88,3d589292-53b7-4e99-b88a-9e8c7661b2be,2025-05-05,ABS,Black,Line 4,1000.65,4.18,Fail,David Pittman,819-303-1909x4296,...,4296,8193031909,,819-303-1909,+ 819-303-1909 ext. 4296,True,2.037846,False,True,FLAGGED
105,aa65b1a5-ed79-4a12-8951-d5a2b8eb0cb5,2025-05-06,PLA,Red,Line 2,1028.54,3.7,Fail,Mary Anderson,247-382-6261x52923,...,52923,2473826261,,247-382-6261,+ 247-382-6261 ext. 52923,True,1.867759,False,True,FLAGGED
119,6e55f7e4-7186-4859-a27e-a1358bb196f0,2025-05-07,TPU,White,Line 4,966.91,3.28,Fail,Paul Macdonald,493-813-3210x19538,...,19538,4938133210,,493-813-3210,+ 493-813-3210 ext. 19538,True,2.348358,False,True,FLAGGED
124,1d892de2-eaba-48ec-8029-c5b4499b5799,2025-05-07,PLA,Red,Line 4,1088.27,2.61,Fail,Mary Anderson,247-382-6261x52923,...,52923,2473826261,,247-382-6261,+ 247-382-6261 ext. 52923,True,1.867759,False,True,FLAGGED
129,70a21345-d127-4bde-8bbe-84ce3fdf86bb,2025-05-08,PLA,Red,Line 4,1022.19,3.16,Fail,Sherry Bryant,001-741-699-1830x254,...,254,17416991830,1.0,741-699-1830,+001 741-699-1830 ext. 254,True,2.450877,True,True,FLAGGED
155,3cf4f4b8-9639-416f-9ab9-f721140d879f,2025-05-09,TPU,Black,Line 4,989.12,3.65,Fail,Sherry Bryant,001-741-699-1830x254,...,254,17416991830,1.0,741-699-1830,+001 741-699-1830 ext. 254,True,2.450877,True,True,FLAGGED


In [713]:
# 7. Summarize: Which shift had the highest failure rate?
failure = df[df['Pass/Fail'] == 'Fail'].groupby('Shift')['Pass/Fail'].count().sort_values(ascending=False)

In [714]:
failure

Shift
Shift C    27
Shift B    19
Shift A    15
Name: Pass/Fail, dtype: int64

In [719]:
failure_rate = (
    df.groupby('Shift')['Pass/Fail']
    .apply(lambda s: round((s == 'Fail').sum() / s.count() * 100, 2))
    .sort_values(ascending=False)
)

In [720]:
failure_rate

Shift
Shift C    15.43
Shift B     9.95
Shift A     8.62
Name: Pass/Fail, dtype: float64