# Model to Predict Cycle Count Variance Probabilities

### The input data contains the following fields:

##### Part Volume: the volume of the Each footprint for the part
##### Part Weight: the net weight of the Each footprint for the part
##### ABC Code: the ABC Code for the part
##### Time Since Last Count: the number of days elapsed since the part/location combination was last counted
##### Total Picks: the number of picks completed for this part/location combination since it was last counted
##### Count Variance (the target label the model predicts): 0 = not variant, 1 = variant

#### Note: the data ignores Count Audits, and only considers count history records for non-Audit Count Types

In [13]:
#imports
import pandas as pd
import os

# Print the working directory so it is clear where the relative paths below
# resolve, then read the train and test spreadsheets into pandas DataFrames.
print(os.getcwd())
train, test = (
    pd.read_excel("./count_train_data.xlsx"),
    pd.read_excel("./count_test_data.xlsx"),
)

C:\Users\034993\Documents\Jupyter Notebooks\Count Variance Project


In [14]:
# Preview the first rows of both datasets: the train preview is printed as
# plain text, while the test preview is the cell's displayed value.
print("Train Data:", "", train.head(), "", "Test Data:", "", sep="\n")
test.head()

Train Data:

   part_volume  part_weight abccod  time_since_last_cntdte  total_picks  \
0          1.0          1.0      C                   99999            0   
1        576.0         16.0      C                   99999            0   
2          1.0          1.0      C                   99999            0   
3        126.0         16.0      C                     279            0   
4          1.0          1.0      C                      28            0   

   count_variance  
0               0  
1               0  
2               0  
3               0  
4               0  

Test Data:



Unnamed: 0,part_volume,part_weight,abccod,time_since_last_cntdte,total_picks,count_variance
0,576.0,16,C,99999,0,0
1,1.0,1,C,99999,0,0
2,1.0,16,C,99999,0,0
3,1.0,1,C,99999,0,0
4,1.0,1,C,99999,0,0


In [15]:
# Check that every field is non-null and inspect the column dtypes of each
# dataset; a single-space line separates the two reports.
for report_index, dataset in enumerate((train, test)):
    if report_index > 0:
        print(" ")
    dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 6 columns):
part_volume               999 non-null float64
part_weight               999 non-null float64
abccod                    999 non-null object
time_since_last_cntdte    999 non-null int64
total_picks               999 non-null int64
count_variance            999 non-null int64
dtypes: float64(2), int64(3), object(1)
memory usage: 46.9+ KB
 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 6 columns):
part_volume               199 non-null float64
part_weight               199 non-null int64
abccod                    199 non-null object
time_since_last_cntdte    199 non-null int64
total_picks               199 non-null int64
count_variance            199 non-null int64
dtypes: float64(1), int64(4), object(1)
memory usage: 9.4+ KB


In [16]:
# Correlate every numeric column with count_variance to see whether any feature
# is strongly related to it.
# The non-numeric abccod column is excluded explicitly: DataFrame.corr() only
# dropped object columns silently in older pandas releases and raises on them
# from pandas 2.0 onwards.
corr_matrix = train.select_dtypes(include='number').corr()
corr_matrix['count_variance'].sort_values(ascending=False)

count_variance            1.000000
total_picks               0.072343
time_since_last_cntdte    0.035958
part_weight              -0.008698
part_volume              -0.024605
Name: count_variance, dtype: float64

In [17]:
# Separate the count_variance label (y) from the remaining columns (X)
# for both the train and the test set.
X_train, y_train = train.drop('count_variance', axis='columns'), train['count_variance']
X_test, y_test = test.drop('count_variance', axis='columns'), test['count_variance']

In [18]:
# Encode the categorical abccod column as integer labels.
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

# Learn the category -> integer mapping from the training data only, then reuse
# that same mapping for the test data. Re-fitting the encoder on the test set
# could assign different integers to the same code whenever the two sets do not
# contain an identical set of categories, silently corrupting the feature.
# transform() raises ValueError for a code that was not seen during fitting.
X_train['abccod'] = le.fit_transform(X_train['abccod'])
X_test['abccod'] = le.transform(X_test['abccod'])

In [19]:
# Fit a logistic regression classifier on the training data.

from sklearn.linear_model import LogisticRegression
# Pin the solver to the one shown in the recorded model repr (liblinear).
# scikit-learn changed the default solver to 'lbfgs' in version 0.22, so
# relying on the default would fit a different model when this notebook is
# re-run on a newer release.
logistic = LogisticRegression(solver='liblinear')
logistic.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [20]:
# Hard class predictions for every row of the test data
# (0 = not variant, 1 = variant).
predicted_labels = logistic.predict(X_test)
predicted_labels

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [21]:
# Per-row class probabilities for the test data: one column per class, in the
# order given by logistic.classes_ (class 0 first, then class 1).
predicted_probabilities = logistic.predict_proba(X_test)
predicted_probabilities

array([[ 0.99031824,  0.00968176],
       [ 0.9072378 ,  0.0927622 ],
       [ 0.98978511,  0.01021489],
       [ 0.9072378 ,  0.0927622 ],
       [ 0.9072378 ,  0.0927622 ],
       [ 0.9072378 ,  0.0927622 ],
       [ 0.9072378 ,  0.0927622 ],
       [ 0.9072378 ,  0.0927622 ],
       [ 0.9072378 ,  0.0927622 ],
       [ 0.98978511,  0.01021489],
       [ 0.9072378 ,  0.0927622 ],
       [ 0.93120719,  0.06879281],
       [ 0.93393456,  0.06606544],
       [ 0.970387  ,  0.029613  ],
       [ 0.98988647,  0.01011353],
       [ 0.93124197,  0.06875803],
       [ 0.98979306,  0.01020694],
       [ 0.57720023,  0.42279977],
       [ 0.57833891,  0.42166109],
       [ 0.9072378 ,  0.0927622 ],
       [ 0.99031824,  0.00968176],
       [ 0.57724829,  0.42275171],
       [ 0.57758468,  0.42241532],
       [ 0.98978511,  0.01021489],
       [ 0.57746936,  0.42253064],
       [ 0.9072378 ,  0.0927622 ],
       [ 0.9072378 ,  0.0927622 ],
       [ 0.57810355,  0.42189645],
       [ 0.57766156,

In [22]:
# Persist the trained model to disk with pickle.
import pickle

filename = 'count_model.sav'
# Use a context manager so the file handle is flushed and closed as soon as
# the dump finishes, instead of being left open until garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(logistic, model_file)

In [23]:
# Reload the saved model and report its mean accuracy on the test data.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load model files that were produced by this notebook or another
# trusted source.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
# In the recorded run every test prediction was 0, so the accuracy printed
# below should be read alongside the share of non-variant rows in the test
# set rather than as evidence that variances are being detected.
result = loaded_model.score(X_test, y_test)
print(result)

0.969849246231
