In [49]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.datasets import make_classification

'''
A scikit-learn inspired script to convert pandas dataframes into libFFM style data.
The script is fairly hacky (hey, that's Kaggle) and takes a little while to run on a huge dataset.
The key to using this class is setting up the feature dtypes correctly for output (amend transform to suit your needs).
Example below
'''


class FFMFormatPandas:
    """Convert a pandas DataFrame into libFFM-style text rows.

    Every output row is ``"<label> <field>:<feature>:<value> ..."``.
    Columns whose dtype kind is ``'O'`` are treated as categorical and
    emitted as ``field:feature:1`` with one feature id per
    ``(column, value)`` pair. Columns whose dtype kind is listed in
    ``numeric_kinds`` are emitted as ``field:feature:value`` with one
    feature id per column. Columns of any other dtype kind are left out of
    the output.

    Attributes set by ``fit``:
        field_index_: dict mapping column name -> field id.
        feature_index_: dict mapping ``"<col>_<value>"`` and ``"<col>"``
            -> feature id.
        y: name of the label column, or ``None``.
    """

    def __init__(self, numeric_kinds='i'):
        """Create an unfitted converter.

        Args:
            numeric_kinds: container of numpy dtype ``kind`` codes that are
                written as numeric features. The default ``'i'`` (signed
                integers only) matches the historical behaviour; pass e.g.
                ``'iuf'`` to also emit unsigned and float columns. Values
                are formatted as-is with ``str.format``.
        """
        self.field_index_ = None
        self.feature_index_ = None
        self.y = None
        self.numeric_kinds = numeric_kinds

    def fit(self, df, y=None):
        """Build (or extend) the field and feature dictionaries from ``df``.

        Args:
            df: DataFrame holding the features and, optionally, the label.
            y: name of the label column in ``df``; ``None`` if absent.

        Returns:
            ``self``, so calls can be chained.
        """
        self.y = y

        if self.field_index_ is None:
            self.field_index_ = {}
        # Field ids follow the order produced by Index.difference; columns
        # first seen on a later fit are appended after the existing ids.
        for col in df.columns.difference([self.y]):
            if col not in self.field_index_:
                self.field_index_[col] = len(self.field_index_)

        if self.feature_index_ is None:
            self.feature_index_ = {}
        # Continue after the highest id already handed out so a refit never
        # reuses an existing feature id.
        next_idx = max(self.feature_index_.values(), default=-1) + 1

        for col in df.columns:
            if self.y is not None and col == self.y:
                # The label is not a feature; keep it out of the dictionary.
                continue
            for val in df[col].unique():
                if pd.isnull(val):
                    continue
                name = '{}_{}'.format(col, val)
                if name not in self.feature_index_:
                    self.feature_index_[name] = next_idx
                    next_idx += 1
            # Per-column id used for numeric columns; keep it stable on refit.
            if col not in self.feature_index_:
                self.feature_index_[col] = next_idx
                next_idx += 1
        return self

    def fit_transform(self, df, y=None):
        """Fit on ``df`` and return its libFFM representation."""
        self.fit(df, y)
        return self.transform(df)

    def transform_row_(self, row, t):
        """Format one DataFrame row as a libFFM line.

        Args:
            row: Series for a single row, indexed by column name.
            t: dict mapping column name -> dtype of that column in the
                source DataFrame.

        Returns:
            The space-separated libFFM line. The label is ``"0"`` when no
            label column was configured in ``fit``.

        Raises:
            KeyError: if a column or a non-null categorical value was not
                seen during ``fit``.
        """
        if self.y is not None:
            # Label-based lookup; avoids positional indexing on a
            # label-indexed Series.
            ffm = [str(row[self.y])]
        else:
            ffm = [str(0)]

        for col, val in row.loc[row.index != self.y].to_dict().items():
            if pd.isnull(val):
                # fit() never indexes null values, so there is nothing to emit.
                continue
            kind = t[col].kind
            if kind == 'O':
                name = '{}_{}'.format(col, val)
                ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
            elif kind in self.numeric_kinds:
                ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
        return ' '.join(ffm)

    def transform(self, df):
        """Return a Series of libFFM lines, one per row, indexed like ``df``."""
        t = df.dtypes.to_dict()
        # Build from a list so rows sharing an index label are all kept
        # (a dict keyed by label would silently drop duplicates).
        lines = [self.transform_row_(row, t) for _, row in df.iterrows()]
        return pd.Series(lines, index=df.index)

########################### Let's build some data and test ############################
### 

'''
train, y = make_classification(n_samples=100, n_features=5, n_informative=2, n_redundant=2, n_classes=2, random_state=42)

train=pd.DataFrame(train, columns=['int1','int2','int3','s1','s2'])
train['int1'] = train['int1'].map(int)
train['int2'] = train['int2'].map(int)
train['int3'] = train['int3'].map(int)
train['s1'] = round(np.log(abs(train['s1'] +1 ))).map(str)
train['s2'] = round(np.log(abs(train['s2'] +1 ))).map(str)
train['clicked'] = y

print("Original train data set")
print(train)

ffm_train = FFMFormatPandas()
ffm_train_data = ffm_train.fit_transform(train, y='clicked')
print('Base data')
print(train[0:10])
print('FFM data')
print(ffm_train_data[0:10])
'''

# Directory holding the input CSV, relative to the current working directory.
dpath = '../'
# Load the training table; the code below reads its "C17", "C18" and "Label" columns.
train = pd.read_csv(dpath +"tr.csv")


def oneHotForFeatures(df, columnName):
    """Return a copy of ``df`` with ``columnName`` replaced by dummy columns.

    The indicator columns are named ``<columnName>_<value>`` and are joined
    on the index after the remaining columns; ``df`` itself is not modified.
    """
    indicator_cols = pd.get_dummies(df[columnName], prefix=columnName)
    remaining = df.drop(columnName, axis=1)
    return remaining.join(indicator_cols)

# Keep only the two categorical columns and the target for this experiment.
train = train[["C17", "C18", "Label"]]
print(train)

# Replace each categorical column with its one-hot indicator columns.
for categorical in ("C17", "C18"):
    train = oneHotForFeatures(train, categorical)

print(train)

# NOTE(review): no label column is passed here, so "Label" is treated as a
# feature and every row gets label 0 -- confirm this is intended.
ffm_train = FFMFormatPandas()
ffm_train_data = ffm_train.fit_transform(train)

#print("FFM format data as follows:")
#print(ffm_train_data)

import os
import sys

# Make the shared helpers importable before pulling in DataUtils.
sys.path.append("../../../utils")
from DataUtils import *

csvfile = os.getcwd() + "/../tr.csv"
print(csvfile)

# Convert the CSV straight to a libFFM file, with "Label" as the target column.
writeToFFMFile(csvfile, "./tr.ffm", "Label")


        




           C17       C18  Label
0     e5ba7672  f54016b9      0
1     07c540c4  b04e4670      0
2     8efede7f  3412118d      0
3     1e88c74f  74ef3502      0
4     1e88c74f  26b3c7a7      0
5     776ce399  92555263      0
6     776ce399  cdfa8259      0
7     e5ba7672  74ef3502      1
8     e5ba7672  42a2edb9      0
9     d4bb7bd8  70d0f5f9      0
10    776ce399  0b331314      0
11    d4bb7bd8  8aaa5b67      0
12    e5ba7672  e5f8f18f      1
13    07c540c4  891589e7      1
14    3486227d  12195b22      0
15    776ce399  3a2028fd      0
16    d4bb7bd8  582152eb      1
17    e5ba7672  d0e5eb07      0
18    e5ba7672  b04e4670      0
19    e5ba7672  5edd90de      0
20    e5ba7672  1999bae9      1
21    d4bb7bd8  5aed7436      0
22    07c540c4  5d93f8ab      0
23    1e88c74f  698d1c68      0
24    d4bb7bd8  e7648a8f      0
25    07c540c4  5aed7436      0
26    27c07bd6  1f868fdd      1
27    3486227d  02e8d897      1
28    e5ba7672  df00d249      1
29    e5ba7672  7181ccc8      1
...     

/root/github/machinelearningstepping/10.CTR/kaggle-criteo-gbdt-ffm/converters/../tr.csv
