In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

%matplotlib inline  

In [2]:
# Read the SiP task-info CSV; the file is decoded with the cp1252 codec.
data = pd.read_csv(
    '../SiP_dataset-master/Sip-task-info.csv',
    encoding='cp1252',
)
# Show the first rows as a quick check of the columns and values.
data.head()

Unnamed: 0,TaskNumber,Summary,Priority,RaisedByID,AssignedToID,AuthorisedByID,StatusCode,ProjectCode,ProjectBreakdownCode,Category,SubCategory,HoursEstimate,HoursActual,DeveloperID,DeveloperHoursActual,TaskPerformance,DeveloperPerformance
0,1735,Flag RI on SCM Message Summary screen using me...,1,58,58,6.0,FINISHED,PC2,PBC42,Development,Enhancement,14.0,1.75,58,1.75,12.25,12.25
1,1742,Allow RI Policies to be marked as Exhausted,1,58,42,6.0,FINISHED,PC2,PBC21,Development,Enhancement,7.0,7.0,42,7.0,0.0,0.0
2,1971,Fix Invalid UWREF Line DX402L99A1N,2,7,58,6.0,FINISHED,PC2,PBC75,Operational,In House Support,0.7,0.7,58,0.7,0.0,0.0
3,2134,New rows in the diary event for the SCM are re...,5,50,42,6.0,FINISHED,PC2,PBC42,Development,Bug,0.7,0.7,42,0.7,0.0,0.0
4,2251,Application Screen Size - Need to set Min Size...,10,46,13,6.0,FINISHED,PC2,PBC21,Development,Bug,3.5,3.5,13,3.5,0.0,0.0


In [3]:
# Check the different values in the unclear (coded) columns.
# A single loop replaces five copy-pasted print statements; the printed text
# is the same as before: "<column> values: " followed by the unique values.
for column in ["StatusCode", "ProjectCode", "ProjectBreakdownCode", "Category", "SubCategory"]:
    print("{} values: \n{}".format(column, pd.unique(data[column])))


StatusCode values: 
['FINISHED' 'CANCELLED' 'RELEASED' 'COMPLETED' 'CHRONICLE' 'ESTIMATED'
 'TEMPLATE' 'AUTHORISE']
ProjectCode values: 
['PC2' 'PC9' 'PC11' 'PC17' 'PC8' 'PC16' 'PC18' 'PC19' 'PC14' 'PC20' 'PC4'
 'PC7' 'PC15' 'PC13' 'PC1' 'PC6' 'PC5' 'PC3' 'PC12' 'PC10']
ProjectBreakdownCode values: 
['PBC42' 'PBC21' 'PBC75' 'PBC11' 'PBC10' 'PBC65' 'PBC53' 'PBC73' 'PBC40'
 'PBC18' 'PBC38' 'PBC15' 'PBC33' 'PBC56' 'PBC35' 'PBC24' 'PBC3' 'PBC6'
 'PBC7' 'PBC5' 'PBC20' 'PBC43' 'PBC31' 'PBC46' 'PBC64' 'PBC12' 'PBC77'
 'PBC62' 'PBC32' 'PBC63' 'PBC26' 'PBC72' 'PBC34' 'PBC13' 'PBC61' 'PBC16'
 'PBC44' 'PBC23' 'PBC41' 'PBC17' 'PBC49' 'PBC30' 'PBC54' 'PBC52' 'PBC59'
 'PBC22' 'PBC25' 'PBC58' 'PBC60' 'PBC36' 'PBC8' 'PBC50' 'PBC28' 'PBC69'
 'PBC37' 'PBC55' 'PBC45' 'PBC47' 'PBC48' 'PBC76' 'PBC9' 'PBC4' 'PBC67'
 'PBC2' 'PBC66' 'PBC71' 'PBC29' 'PBC14' 'PBC51' 'PBC70' 'PBC19' 'PBC74'
 'PBC57' 'PBC1' 'PBC39' 'PBC68' 'PBC27']
Category values: 
['Development' 'Operational' 'Management']
SubCategory values: 


In [13]:
# Subset of columns carried forward into the following cells.
# (Line continuations are unnecessary inside brackets, so none are used.)
relevent_attributes = [
    "Priority",
    "RaisedByID",
    "AssignedToID",
    "AuthorisedByID",
    "StatusCode",
    "ProjectCode",
    "Category",
    "SubCategory",
    "HoursEstimate",
    "HoursActual",
]
# Keep all rows, restrict to the listed columns, then inspect their dtypes.
relevent_data = data.loc[:, relevent_attributes]
relevent_data.dtypes

Priority            int64
RaisedByID          int64
AssignedToID        int64
AuthorisedByID    float64
StatusCode         object
ProjectCode        object
Category           object
SubCategory        object
HoursEstimate     float64
HoursActual       float64
dtype: object

# Change attributes to continuous attributes

In [14]:
# Build a separate frame from relevent_data in which the four object-dtype
# columns are cast to pandas' categorical dtype; all other columns keep
# their dtype. astype returns a new frame, so relevent_data is not modified.
obj_df = relevent_data.astype(
    {
        "StatusCode": 'category',
        "ProjectCode": 'category',
        "Category": 'category',
        "SubCategory": 'category',
    }
)
obj_df.dtypes

Priority             int64
RaisedByID           int64
AssignedToID         int64
AuthorisedByID     float64
StatusCode        category
ProjectCode       category
Category          category
SubCategory       category
HoursEstimate      float64
HoursActual        float64
dtype: object

In [15]:
# Replace every categorical column of obj_df with its integer category codes.
# Columns are selected by dtype rather than named one by one so the cell is
# safe to re-run: after the first run the columns hold plain integers and are
# no longer selected, whereas calling `.cat` on them again would raise
# AttributeError.
# Note: `.cat.codes` encodes missing values as -1.
for column in obj_df.select_dtypes(include="category").columns:
    obj_df[column] = obj_df[column].cat.codes
obj_df.head()

Unnamed: 0,Priority,RaisedByID,AssignedToID,AuthorisedByID,StatusCode,ProjectCode,Category,SubCategory,HoursEstimate,HoursActual
0,1,58,58,6.0,5,11,0,7,14.0,1.75
1,1,58,42,6.0,5,11,0,7,7.0,7.0
2,2,7,58,6.0,5,11,2,9,0.7,0.7
3,5,50,42,6.0,5,11,0,1,0.7,0.7
4,10,46,13,6.0,5,11,0,1,3.5,3.5


# Scale 0-1

In [17]:
from sklearn import preprocessing

# Rescale every column of obj_df independently to the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows and on every column,
# including HoursActual and the integer-coded category/ID columns — confirm
# this is intended before using the result for a train/test evaluation.
x = obj_df.values  # returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
# Rebuild the frame with the original column labels AND the original row
# index; without index= the result silently falls back to a fresh 0..n-1
# index, which would misalign rows if obj_df were ever filtered or re-indexed.
obj_df = pd.DataFrame(x_scaled, columns=obj_df.columns, index=obj_df.index)
obj_df.head()


Unnamed: 0,Priority,RaisedByID,AssignedToID,AuthorisedByID,StatusCode,ProjectCode,Category,SubCategory,HoursEstimate,HoursActual
0,0.0,0.863636,0.890625,0.0,0.714286,0.578947,0.0,0.304348,0.015374,0.000699
1,0.0,0.863636,0.640625,0.0,0.714286,0.578947,0.0,0.304348,0.007681,0.002807
2,0.111111,0.090909,0.890625,0.0,0.714286,0.578947,1.0,0.391304,0.000758,0.000277
3,0.444444,0.742424,0.640625,0.0,0.714286,0.578947,0.0,0.043478,0.000758,0.000277
4,1.0,0.681818,0.1875,0.0,0.714286,0.578947,0.0,0.043478,0.003835,0.001402


In [18]:
# Write the current obj_df to CSV.
# Note: to_csv also writes the row index (as an unnamed first column) unless
# index=False is passed.
obj_df.to_csv(path_or_buf='../SiP_dataset-master/SIP_CAT.csv')

# Change attribute to categorical

In [None]:
# Start again from a fresh copy of relevent_data and cast the four
# object-dtype columns to pandas' categorical dtype, one column at a time.
obj_df = relevent_data.copy()
for column in ("StatusCode", "ProjectCode", "Category", "SubCategory"):
    obj_df[column] = obj_df[column].astype('category')
obj_df.dtypes