In [1]:
import re
import string

import joblib
import nltk
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

pd.set_option("display.max_rows", 3000)

In [65]:
filename = input("Enter the file name you wish to train: ")
sheetname = input("Enter sheet name from file: ")
mock_data = pd.read_excel(filename, sheet_name=sheetname)
print("\nThe file has {} rows and {} columns" .format(mock_data.shape[0], mock_data.shape[1]))

Enter the file name you wish to train: mockdata_set.xlsx
Enter sheet name from file: input_1_conduit_data
Data has 150 rows and 34 columns


In [3]:
mock_data.head(2)

Unnamed: 0,Short Desc,Long Desc,Image,Size,Length,Material,Type,Application,Finish,Screw Size,...,Width,Radius,Shape,Coating Thickness,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33
0,"Metallic Liquidtight Conduit, Flexible, LA, Ga...",_x000D_\nType LA_x000D_\n,0,2-1/2 in.,100 ft.,PVC Coated Galvanized Steel,LFMC,0,0,0,...,0,0,0,0,,,,,,
1,"Metallic Liquidtight Conduit, Flexible, Stainl...","\nMetallic Liquidtight Conduit, Flexible, Stai...",0,2 in.,25 ft.,stainless steel,LFMC,0,0,0,...,0,0,0,0,,,,,,


In [4]:
mock_data = mock_data[["Short Desc", "Long Desc", "Size", "Length", "Material", "Type"]]
mock_data.head(2)

Unnamed: 0,Short Desc,Long Desc,Size,Length,Material,Type
0,"Metallic Liquidtight Conduit, Flexible, LA, Ga...",_x000D_\nType LA_x000D_\n,2-1/2 in.,100 ft.,PVC Coated Galvanized Steel,LFMC
1,"Metallic Liquidtight Conduit, Flexible, Stainl...","\nMetallic Liquidtight Conduit, Flexible, Stai...",2 in.,25 ft.,stainless steel,LFMC


In [5]:
# rename the columns 
mock_data.columns = mock_data.columns.str.lower().str.replace(" ", "_")
mock_data.columns

Index(['short_desc', 'long_desc', 'size', 'length', 'material', 'type'], dtype='object')

## Clean up the data

In [6]:
# Check for missing values
mock_data.isnull().sum()

short_desc    0
long_desc     0
size          0
length        0
material      0
type          0
dtype: int64

In [7]:
# if 

## Preprocessing Data, Feature Selection and Model

In [8]:
labelencoder = LabelEncoder()

for column in ["size", "length", "material", "type"]:
    mock_data[column] = mock_data[column].astype(str)
    mock_data[column + "_"] = labelencoder.fit_transform(mock_data[column])
    
mock_data.head(2)

Unnamed: 0,short_desc,long_desc,size,length,material,type,size_,length_,material_,type_
0,"Metallic Liquidtight Conduit, Flexible, LA, Ga...",_x000D_\nType LA_x000D_\n,2-1/2 in.,100 ft.,PVC Coated Galvanized Steel,LFMC,6,2,6,7
1,"Metallic Liquidtight Conduit, Flexible, Stainl...","\nMetallic Liquidtight Conduit, Flexible, Stai...",2 in.,25 ft.,stainless steel,LFMC,5,9,13,7


In [9]:
mock_data.sort_values(by="type_")[["type", "type_"]]

Unnamed: 0,type,type_
27,EMT,0
33,EMT,0
32,EMT,0
31,EMT,0
30,EMT,0
29,EMT,0
28,EMT,0
36,EMT,0
37,EMT,0
38,EMT,0


In [10]:
type_ = str(dict(zip(mock_data["type_"], mock_data["type"]))).replace("\'", "\"")
material = str(dict(zip(mock_data["material_"], mock_data["material"]))).replace("\'", "\"")
size = str(dict(zip(mock_data["size_"], mock_data["size"]))).replace("\'", "\"")
length = str(dict(zip(mock_data["length_"], mock_data["length"]))).replace("\'", "\"")

filenames = ["type_", "material", "size", "length"]


for filename, content in zip(filenames, [type_, material, size, length]):
    with open("{}.txt".format(filename), "w") as f:
        f.write(content)

In [11]:
mock_data[mock_data["short_desc"] == "Non-Metallic Gray Liquidtight Flexible Conduit, 1-1/4 in."]

Unnamed: 0,short_desc,long_desc,size,length,material,type,size_,length_,material_,type_


In [12]:
type_

'{7: "LFMC", 8: "LFNC", 0: "EMT", 2: "FMC", 4: "GRC", 5: "IMC", 10: "PVCC", 11: "RMC", 1: "ENT", 3: "FNC", 6: "Innerduct", 9: "PVC"}'

In [13]:
mock_data[mock_data["type_"] == 0]

Unnamed: 0,short_desc,long_desc,size,length,material,type,size_,length_,material_,type_
27,"Electroplated Steel EMT Conduit, 1-1/4 in.",Product Overview:_x000D_\n1-1/4 in. Electropla...,1-1/4 in.,10 ft.,Galvanized Steel,EMT,3,0,1,0
28,"Electroplated Steel Thin EMT Conduit, Blue, 2 in.",Product Overview:\n2 in. Electroplated steel E...,2 in.,10 ft.,Galvanized Steel,EMT,5,0,1,0
29,"Electroplated Steel Thin EMT Conduit, Black, 2...",Product Overview:_x000D_\n2 in. Electroplated ...,2 in.,10 ft.,Galvanized Steel,EMT,5,0,1,0
30,"Hot-Galvanized Steel EMT Conduit, 4 Inch",Product Overview:_x000D_\n3/4 Inch emt conduit...,4 in.,20 ft.,Galvanized Steel,EMT,12,6,1,0
31,"Electroplated Steel EMT Conduit, Orange, 1/2 in.",Product Overview:_x000D_\n1/2 in. Electroplate...,1/2 in.,10 ft.,Galvanized Steel,EMT,4,0,1,0
32,"Western EMT Conduit, 1-1/2 in.","Product Overview:_x000D_\nEMT Conduit, Carbon ...",1-1/2 in.,10 ft.,Galvanized Steel,EMT,2,0,1,0
33,"Electroplated Steel EMT Conduit, White, 1 in.",Product Overview:_x000D_\nSteel EMT 1 inch ele...,1 in.,10 ft.,Galvanized Steel,EMT,1,0,1,0
34,"Electroplated Steel EMT Conduit, Green, 1-1/4 in.",Product Overview:_x000D_\n1-1/4 in. Electropla...,1-1/4 in.,10 ft.,Galvanized Steel,EMT,3,0,1,0
35,"Electroplated Steel Thin EMT Conduit, Red, 2 in.",Product Overview:_x000D_\n2 in. Electroplated ...,2 in.,20 ft.,Galvanized Steel,EMT,5,6,1,0
36,"Electroplated Steel EMT Conduit, Green, 1 in.",Product Overview:_x000D_\nSteel EMT 1 inch ele...,1 in.,10 ft.,Galvanized Steel,EMT,1,0,1,0


In [14]:
mock_data["type"].unique()

array(['LFMC', 'LFNC', 'EMT', 'FMC', 'GRC', 'IMC', 'PVCC', 'RMC', 'ENT',
       'FNC', 'Innerduct', 'PVC'], dtype=object)

In [15]:
mock_data["material"].unique()

array(['PVC Coated Galvanized Steel', 'stainless steel', 'PVC', 'Nylon',
       'Galvanized Steel', 'Stainless Steel', 'Aluminium', 'Plenum-PVDF',
       'Riser-Nylon', 'Riser-PVDF', 'HDPE-Schedule 80', 'HDPE',
       'Schedule 80', 'Schedule 40'], dtype=object)

In [16]:
size

'{6: "2-1/2 in.", 5: "2 in.", 4: "1/2 in.", 10: "3/4 in.", 2: "1-1/2 in.", 8: "3 in.", 3: "1-1/4 in.", 1: "1 in.", 12: "4 in.", 9: "3-1/2 in.", 13: "5 in.", 14: "6 in.", 0: "1 1/4 in.", 7: "21 mm", 11: "4"}'

In [17]:
stopwords = nltk.corpus.stopwords.words("english")
word_net = nltk.WordNetLemmatizer()


def clean_text(text):
    text = str(text).lower()
    text = str(text).replace("\n", " ")
    text = "".join(word for word in str(text) if word not in string.punctuation)
    tokens = re.split("\W+", text)
    lemmatized = [word_net.lemmatize(word) for word in tokens if word not in stopwords]
    return lemmatized

In [18]:
count_vector = CountVectorizer(analyzer=clean_text)
count_vector_ = CountVectorizer(analyzer=clean_text)

vector = count_vector.fit_transform(mock_data["short_desc"])
vector_ = count_vector_.fit_transform(mock_data["long_desc"])

In [19]:
short_desc_df = pd.DataFrame(vector.todense(), columns=count_vector.get_feature_names())
long_desc_df = pd.DataFrame(vector_.toarray(), columns=count_vector_.get_feature_names())

In [20]:
short_desc_df.head(2)

Unnamed: 0,0,0622,0642,065,0755,0785,084,1,10,100,...,ua,wall,weight,western,wet,white,without,x,xtra,yellow
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
short_long_desc_df = pd.concat([short_desc_df, long_desc_df], axis=1)
short_long_desc_df = short_long_desc_df.groupby(short_long_desc_df.columns, axis=1).sum()

independent_variables = short_long_desc_df.columns

In [22]:
material_df = pd.concat([short_long_desc_df, mock_data["material_"]], axis=1)
size_df = pd.concat([short_long_desc_df, mock_data["size_"]], axis=1)
length_df = pd.concat([short_long_desc_df, mock_data["length_"]], axis=1)
type_df = pd.concat([short_long_desc_df, mock_data["type_"]], axis=1)

In [23]:
material_df.shape, size_df.shape, type_df.shape, length_df.shape

((150, 820), (150, 820), (150, 820), (150, 820))

In [24]:
X = material_df[independent_variables]
y = material_df["material_"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, )

In [25]:
X_train = X_train.groupby(X_train.columns, axis=1).sum()
X_test = X_test.groupby(X_test.columns, axis=1).sum()

In [26]:
X_train.shape, X_test.shape

((105, 819), (45, 819))

In [27]:
def mae(y_true, y_pred):
    # mean absolute error
    return np.mean(abs(y_true - y_pred))

def training_and_evaluate(model):
    model.fit(X_train, y_train)
    
    model_pred = model.predict(X_test)
    
    model_mae = mae(y_test, model_pred)
    
    return model_mae

In [28]:
rf = RandomForestClassifier()
dt = DecisionTreeClassifier()

In [60]:
print("==========================================\n")
print("          Mean Absolute Errors:")
print("    (lower error => better performance)")
print("\n==========================================")


          Mean Absolute Errors:
    (lower error => better performance)



In [29]:
def modelling(df, target_variable, models):
    print(target_variable)
    X = df[independent_variables]
    y = df[target_variable]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
    
    for model in models:
        
        model.fit(X_train, y_train)
        model_pred = model.predict(X_test)
        model_mae = np.mean(abs(y_test - model_pred))

        print("{} Mean Absolute Error: {}".format(model, model_mae))
    print("\n")

In [30]:
for df, column in zip([material_df, size_df, length_df, type_df], ["material_", "size_", "length_", "type_"]):
    modelling(df, column, [rf, dt])

material_
RandomForestClassifier() Mean Absolute Error: 0.28888888888888886
DecisionTreeClassifier() Mean Absolute Error: 0.28888888888888886


size_
RandomForestClassifier() Mean Absolute Error: 1.0444444444444445
DecisionTreeClassifier() Mean Absolute Error: 0.9555555555555556


length_
RandomForestClassifier() Mean Absolute Error: 3.6666666666666665
DecisionTreeClassifier() Mean Absolute Error: 2.4


type_
RandomForestClassifier() Mean Absolute Error: 0.5777777777777777
DecisionTreeClassifier() Mean Absolute Error: 0.5777777777777777




In [58]:
print("==========================")
print("     Accuracy Scores:")
print("==========================")

     Accuracy Scores:


In [31]:
X = material_df[independent_variables]
X = X.groupby(X.columns, axis=1).sum()

y = material_df["material_"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [32]:
material_rf = rf.fit(X_train, y_train)
material_dt = dt.fit(X_train, y_train)

In [33]:
df = pd.DataFrame(columns=X_train.columns)
df.to_excel("empty_dataframe.xlsx")

In [34]:
material_rf_pred = material_rf.predict(X_test)
print("material: {}".format(accuracy_score(y_test, material_rf_pred)))

material: 0.9555555555555556


In [35]:
joblib.dump(material_rf, "./material_random_forest.joblib", compress=True)
joblib.dump(material_dt, "./material_decision_trees.joblib", compress=True)

['./material_decision_trees.joblib']

In [36]:
X = length_df[independent_variables]
y = length_df["length_"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [37]:
length_rf = rf.fit(X_train, y_train)
length_dt = dt.fit(X_train, y_train)

In [38]:
length_rf_pred = length_rf.predict(X_test)
print("length: {}".format(accuracy_score(y_test, length_rf_pred)))

length: 0.6444444444444445


In [39]:
joblib.dump(length_rf, "./length_random_forest.joblib", compress=True)
joblib.dump(length_dt, "./length_decision_trees.joblib", compress=True)

['./length_decision_trees.joblib']

In [40]:
X = size_df[independent_variables]
y = size_df["size_"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [41]:
size_rf = rf.fit(X_train, y_train)
size_dt = dt.fit(X_train, y_train)

In [42]:
size_rf_pred = size_rf.predict(X_test)
print("size: {}".format(accuracy_score(y_test, size_rf_pred)))

size: 0.6888888888888889


In [43]:
joblib.dump(size_rf, "./size_random_forest.joblib", compress=True)
joblib.dump(size_dt, "./size_decision_trees.joblib", compress=True)

['./size_decision_trees.joblib']

In [44]:
X = type_df[independent_variables]
y = type_df["type_"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [45]:
type_rf = rf.fit(X_train, y_train)
type_dt = dt.fit(X_train, y_train)

In [46]:
type_rf_pred = type_rf.predict(X_test)
print("type: {}".format(accuracy_score(y_test, type_rf_pred)))

type: 0.9111111111111111


In [47]:
joblib.dump(type_rf, "./type_random_forest.joblib", compress=True)
joblib.dump(type_dt, "./type_decision_trees.joblib", compress=True)

['./type_decision_trees.joblib']