In [1]:
# importing necessary modules
import pandas as pd

In [2]:
# Loading the test files
patient_monthwise_revenue_test = pd.read_csv('data/patient_monthwise_revenue_test.csv')
physio_diagnosis_test = pd.read_csv('data/physio_diagnosis_test.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Revenue test information
patient_monthwise_revenue_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55329 entries, 0 to 55328
Data columns (total 20 columns):
patient_id          55329 non-null int64
visit_month_year    55329 non-null object
service_id          55329 non-null int64
city                55329 non-null object
ref_type            55329 non-null object
ref_name            55329 non-null object
ref_source          55329 non-null object
service_name        54477 non-null object
FVD                 55329 non-null object
FVM                 55329 non-null object
FVS                 55329 non-null object
approx_age          55329 non-null object
gender              55298 non-null object
LVD                 55329 non-null object
brand               55329 non-null object
visits_required     55329 non-null int64
diagnosis           54162 non-null object
avg_nps             55329 non-null object
Unnamed: 18         28 non-null object
Unnamed: 19         1 non-null object
dtypes: int64(3), object(17)
memory usage: 8.4+ MB


In [4]:
# Diagnosis test information
physio_diagnosis_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32455 entries, 0 to 32454
Data columns (total 8 columns):
patientId     32455 non-null int64
diagnosis     31786 non-null object
Service       32455 non-null object
AGE           32455 non-null object
gender        32430 non-null object
LVD           32455 non-null object
Unnamed: 6    13 non-null object
Unnamed: 7    1 non-null object
dtypes: int64(1), object(7)
memory usage: 2.0+ MB


In [5]:
# Cleaning and merging data

# Removing unnecessary data

# FVS - doesnt have proper information?
# diagnosis - duplicate data with 'physio_diagnosis_train'
# may be if it is missing in train_classified for particular id, we need to take the value from here ?
# service_name - since we have service_id
# approx_age - since we have AGE in physio_diagnosis_train
# Unnamed: 18 - not required
# Unamed: 19 - not required
patient_monthwise_revenue_test = patient_monthwise_revenue_test.drop(columns=
                                                                       ['FVS', 'diagnosis','service_name', 'approx_age',
                                                                        'Unnamed: 18', 'Unnamed: 19'])

In [6]:
# gender - it is there in patient_monthwise_revenue_train
# LVD - it is there in patient_monthwise_revenue_train
# Service - we have service_id in patient_monthwise_revenue_train
# Unnamed: 7 - not required
# Unamed: 8 - not required
# Appt_id - since there is no mapping of training data we are loading for analysis
physio_diagnosis_test = physio_diagnosis_test.drop(columns=
                                                     ['gender', 'LVD', 'Unnamed: 6', 'Unnamed: 7', 'Service'])

In [7]:
# Getting the visit_count by doing groupby of patient_id values
# Getting only patient_id
df_pat_id = pd.DataFrame(patient_monthwise_revenue_test['patient_id'])
# Performing Group by -> reset index -> renaming column to 'visit_count'
df_pat_visit_count = df_pat_id.groupby(df_pat_id.columns.tolist(),as_index=False).size().reset_index().rename(columns={0:'visit_count'})

In [8]:
# Adding visit count also
md_monthwise_revenue_visit_count_test = pd.merge(patient_monthwise_revenue_test, df_pat_visit_count, on='patient_id')

In [9]:
physio_diagnosis_test = physio_diagnosis_test.rename(columns={'patientId': 'patient_id'})

In [10]:
full_test_raw = pd.merge(md_monthwise_revenue_visit_count_test, physio_diagnosis_test, on='patient_id')

In [11]:
full_test_raw.columns.values

array(['patient_id', 'visit_month_year', 'service_id', 'city', 'ref_type',
       'ref_name', 'ref_source', 'FVD', 'FVM', 'gender', 'LVD', 'brand',
       'visits_required', 'avg_nps', 'visit_count', 'diagnosis', 'AGE'],
      dtype=object)

In [12]:
# New order columns as per the same order as train data
new_cols_order = ['patient_id', 'visit_month_year', 'service_id', 'visit_count', 'city', 'ref_type',
       'ref_name', 'ref_source', 'FVD', 'FVM', 'gender', 'LVD', 'brand',
       'visits_required', 'avg_nps', 'diagnosis', 'AGE']

In [13]:
full_test_raw = full_test_raw[new_cols_order]

In [14]:
# Looking at the new order now, which should be the same order as train data
full_test_raw.columns.values

array(['patient_id', 'visit_month_year', 'service_id', 'visit_count',
       'city', 'ref_type', 'ref_name', 'ref_source', 'FVD', 'FVM',
       'gender', 'LVD', 'brand', 'visits_required', 'avg_nps',
       'diagnosis', 'AGE'], dtype=object)

In [15]:
missing_values_count = full_test_raw.isnull().sum()

In [16]:
missing_values_count

patient_id             0
visit_month_year       0
service_id             0
visit_count            0
city                   0
ref_type               0
ref_name               0
ref_source             0
FVD                    0
FVM                    0
gender                31
LVD                    0
brand                  0
visits_required        0
avg_nps                0
diagnosis           1117
AGE                    0
dtype: int64

In [17]:
# Missing gender 31, diagnosis 1117, fill them with 0
full_test_raw = full_test_raw.fillna(0)

In [18]:
cols_to_process = ['visit_month_year','city','ref_type','ref_name',
                     'ref_source','FVD','FVM','gender','LVD','brand',
                     'avg_nps','diagnosis','AGE']

In [19]:
# Converting into string 
full_test_raw[cols_to_process] = full_test_raw[cols_to_process].astype(str)

In [20]:
from six.moves import cPickle as pickle

In [21]:
try:
    # Loading the LabelEncoder dict objects created when training for encoding the Data
    with open('data/cols_with_classes.pickle', 'rb') as f:
        cols_with_classes = pickle.load(f)
except Exception as e:
    print('Exception ', e)

In [22]:
# Encoding the string values using the same encoding we used for encoding during training
for colm in cols_to_process:
    full_test_raw[colm] = cols_with_classes[colm].fit_transform(full_test_raw[colm])

In [23]:
full_test_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55329 entries, 0 to 55328
Data columns (total 17 columns):
patient_id          55329 non-null int64
visit_month_year    55329 non-null int64
service_id          55329 non-null int64
visit_count         55329 non-null int64
city                55329 non-null int64
ref_type            55329 non-null int64
ref_name            55329 non-null int64
ref_source          55329 non-null int64
FVD                 55329 non-null int64
FVM                 55329 non-null int64
gender              55329 non-null int64
LVD                 55329 non-null int64
brand               55329 non-null int64
visits_required     55329 non-null int64
avg_nps             55329 non-null int64
diagnosis           55329 non-null int64
AGE                 55329 non-null int64
dtypes: int64(17)
memory usage: 7.6 MB


In [24]:
full_test_raw.to_csv('data/full_test_wo_appts_cleaned_encoded.csv', index=False)

In [25]:
try:
    # Loading the classifier
    with open('data/train_svc_clf.pickle', 'rb') as f:
        clf = pickle.load(f)
except Exception as e:
    print('Exception ', e)

In [26]:
# Converting to numpy array
X_test = full_test_raw.as_matrix()

In [27]:
# Running the predictions on the data
Y_test = clf.predict(X_test)

In [42]:
len(Y_test)

55329

In [44]:
patient_id_df = pd.DataFrame(full_test_raw['patient_id'])
Y_test_df = pd.DataFrame(Y_test)

In [56]:
# Joining the predicted Labels with the patient_id
predictions = patient_id_df.join(Y_test_df).rename(columns={0: 'Bucket', 'patient_id': 'PID'})

In [57]:
predictions.sample(5)

Unnamed: 0,PID,Bucket
22983,141269,3
32897,211034,3
6990,50580,3
47735,354227,3
11660,71267,3


In [53]:
# Predictions count is more than the submission count
len(predictions)

55329

In [65]:
submission = pd.read_csv('data/Submission.csv')

In [None]:
for id, row in submission.iterrows():
    submission['Bucket'].iloc[id] = predictions.loc[predictions['PID'] == row['PID'], 'Bucket'].iloc[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [67]:
len(submission)

32455

In [68]:
submission

Unnamed: 0,PID,Bucket
0,111331,
1,27477,
2,439072,
3,346329,
4,58664,
5,378804,
6,391628,
7,31220,
8,55964,
9,72108,


In [51]:
submission = submission.drop(columns=['Bucket'])

In [52]:
# Here the count is less than the prediction count
len(submission)

32455

In [55]:
submission.columns.values

array(['PID'], dtype=object)

In [58]:
final_submission = pd.merge(submission, predictions, on='PID')

In [59]:
len(final_submission)

55329

In [60]:
missing_values_count = final_submission.isnull().sum()

In [62]:
final_submission = final_submission.drop_duplicates()

In [63]:
len(final_submission)

32455

In [64]:
final_submission

Unnamed: 0,PID,Bucket
0,111331,3
1,27477,3
2,439072,3
3,346329,3
4,58664,3
5,378804,3
7,391628,3
9,31220,3
11,55964,3
12,72108,3
