In [1]:
import random

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# sklearn.cross_validation was removed in scikit-learn 0.20; prefer its
# replacement and keep the original import as a fallback for old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split



In [2]:
# Load the labelled training accounts, drop rows that are entirely empty, and
# normalise two column spellings to the names used by the feature list.
df = (
    pd.read_csv('/Users/lilixu/Desktop/training_data_2.csv')
    .dropna(how='all')
    .rename(columns={
        'favourites_count': 'favorites_count',
        'listedcount': 'listed_count',
    })
)

In [3]:
def _has_token(values, token, missing=False):
    """Per-value flag: does the lower-cased string contain ``token``?

    Non-string values (NaN, None, numbers) map to ``missing``.
    """
    return [token in v.lower() if isinstance(v, str) else missing for v in values]


def _is_present(value):
    """Truthiness of ``value``, with missing values (NaN/None) counted as False."""
    return False if pd.isna(value) else bool(value)


def _is_real_text(value):
    """True for strings other than the 'None' / 'null' placeholders."""
    return isinstance(value, str) and value not in ('None', 'null')


def _is_ascii(value):
    """True when ``value`` is not a string or holds only code points below 128."""
    return not isinstance(value, str) or all(ord(ch) < 128 for ch in value)


def _text_len(value):
    """Length of ``value`` when it is a string, otherwise 0."""
    return len(value) if isinstance(value, str) else 0


def _first_year(value, years, unknown):
    """First entry of ``years`` whose decimal text occurs in ``value``.

    Returns ``unknown`` when ``value`` is not a string or no year matches, so
    every row yields exactly one value.
    """
    if isinstance(value, str):
        for year in years:
            if str(year) in value:
                return year
    return unknown


def derive_features(df, years=range(2006, 2018), unknown_year=0):
    """Add hand-crafted feature columns to ``df`` in place.

    Reads the columns ``name``, ``screen_name``, ``description``, ``status``,
    ``location``, ``lang``, ``url`` and ``created_at`` and writes:

    * ``*_has_bot`` / ``*_has_alert`` / ``dp_has_*``: case-insensitive
      substring flags (``dp`` = description, ``st`` = status, ``sname`` =
      screen_name, ``loc`` = location). Missing text counts as False.
    * ``is_en``: ``lang`` contains 'en'; a missing ``lang`` counts as True.
    * ``has_loc`` / ``has_dp``: value is present and truthy (non-empty).
    * ``has_url`` / ``has_st``: value is a string other than 'None' / 'null'.
    * ``dp_is_asc``: description and name contain no code point >= 128
      (missing values count as ASCII).
    * ``count_dp`` / ``len_*``: space-separated token count and character
      lengths, 0 for missing values.
    * ``year``: first entry of ``years`` found as a substring of
      ``created_at``, or ``unknown_year`` when none is found.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    years : iterable of int, optional
        Candidate years, tried in order. Defaults to 2006..2017.
    unknown_year : int, optional
        Value stored when ``created_at`` is missing or contains no candidate.

    Returns
    -------
    None
        Kept as ``None`` so a bare ``derive_features(df)`` at the end of a
        notebook cell does not print the frame.
    """
    # Materialise once: ``years`` is re-scanned for every row.
    years = tuple(years)

    name = df['name']
    screen_name = df['screen_name']
    description = df['description']
    status = df['status']
    location = df['location']

    # Every flag goes through the same helper so that missing values are
    # handled uniformly (the name / screen_name "bot" flags used to keep NaN).
    df['name_has_bot'] = _has_token(name, 'bot')
    df['sname_has_bot'] = _has_token(screen_name, 'bot')
    df['dp_has_bot'] = _has_token(description, 'bot')
    df['st_has_bot'] = _has_token(status, 'bot')
    df['loc_has_bot'] = _has_token(location, 'bot')

    df['dp_has_alert'] = _has_token(description, 'alert')
    df['name_has_alert'] = _has_token(name, 'alert')
    df['sname_has_alert'] = _has_token(screen_name, 'alert')

    df['dp_has_at'] = _has_token(description, '@')
    df['dp_has_pound'] = _has_token(description, '#')
    df['dp_has_every'] = _has_token(description, 'every')
    df['dp_has_auto'] = _has_token(description, 'auto')

    # A missing language is deliberately counted as English.
    df['is_en'] = _has_token(df['lang'], 'en', missing=True)

    df['has_loc'] = [_is_present(v) for v in location]
    df['has_dp'] = [_is_present(v) for v in description]
    df['has_url'] = [_is_real_text(v) for v in df['url']]
    df['has_st'] = [_is_real_text(v) for v in status]

    # True only when both description and name are free of non-ASCII text;
    # a missing name no longer raises.
    df['dp_is_asc'] = [_is_ascii(d) and _is_ascii(n) for d, n in zip(description, name)]

    df['count_dp'] = [len(v.split(' ')) if isinstance(v, str) else 0 for v in description]
    df['len_dp'] = [_text_len(v) for v in description]
    df['len_name'] = [_text_len(v) for v in name]
    df['len_sname'] = [_text_len(v) for v in screen_name]
    df['len_loc'] = [_text_len(v) for v in location]

    # Exactly one value per row, so the column always matches the frame length.
    df['year'] = [_first_year(v, years, unknown_year) for v in df['created_at']]

In [4]:
# Add the derived feature columns to the training frame; derive_features
# assigns the new columns directly on the frame that is passed in.
derive_features(df)

In [5]:
# Treat a missing value in any of the boolean profile flags as False.
for flag_column in ("default_profile", "default_profile_image", "has_extended_profile", "verified"):
    df[flag_column] = df[flag_column].fillna(False)

In [6]:
# Columns fed to the classifier, assembled group by group. The resulting
# order is the order in which the columns are passed to fit/predict.
numfeatures = (
    # count columns
    ['followers_count', 'friends_count', 'listed_count', 'favorites_count', 'statuses_count']
    # boolean profile flags
    + ["default_profile", "default_profile_image", "has_extended_profile", 'verified']
    # "bot" keyword flags
    + ['name_has_bot', 'sname_has_bot', 'dp_has_bot', 'st_has_bot', 'loc_has_bot']
    # "alert" keyword flags
    + ['dp_has_alert', 'name_has_alert', 'sname_has_alert']
    # other description keyword / symbol flags
    + ['dp_has_at', 'dp_has_every', 'dp_has_auto', 'dp_has_pound']
    + ['dp_is_asc']
    # language and presence flags
    + ['is_en', 'has_loc', 'has_url', 'has_dp', 'has_st']
    # text lengths and creation year
    + ['len_dp', 'len_sname', 'len_loc', 'len_name', 'year']
)

### Test

In [7]:
def rf(n_estimators=200, test_size=0.2, random_state=None):
    """Fit a random forest on a random split of ``df`` and score the hold-out.

    Uses the module-level ``df`` and ``numfeatures``; the target column is
    ``'bot'``.

    Parameters
    ----------
    n_estimators : int, optional
        Number of trees in the forest (default 200).
    test_size : float, optional
        Fraction of ``df`` held out for scoring (default 0.2).
    random_state : int or None, optional
        Seed passed to both the split and the forest. ``None`` (the default)
        leaves each call unseeded, so repeated calls give different splits.

    Returns
    -------
    float
        ``random_forest.score`` on the held-out rows (mean accuracy for
        scikit-learn classifiers).
    """
    train, test = train_test_split(df, test_size=test_size, random_state=random_state)
    random_forest = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    random_forest.fit(train[numfeatures], train['bot'])
    return random_forest.score(test[numfeatures], test['bot'])
# Hold-out scores from 20 independent random splits.
res = [rf() for _ in range(20)]

In [8]:
# Fit one larger forest on a fresh 80/20 split and keep the held-out rows
# whose predicted label differs from the true 'bot' label.
train, test = train_test_split(df, test_size=0.2)
random_forest = RandomForestClassifier(n_estimators=500)
random_forest.fit(train[numfeatures], train['bot'])
error = test.loc[test['bot'] != random_forest.predict(test[numfeatures])]

# Submission

In [9]:
# Load the unlabelled accounts to predict and drop rows that are entirely empty.
df2 = (
    pd.read_csv('/Users/lilixu/Desktop/test_data_4_students.csv')
    .dropna(how='all')
)

In [10]:
# Add the same derived feature columns to the submission frame; the new
# columns are assigned directly on df2.
derive_features(df2)

In [11]:
# Convert the profile flag columns to real booleans. A value counts as True
# when it is already a boolean True or is the text "true" in any letter case
# (surrounding whitespace ignored); everything else, including missing
# values, becomes False. Accepting real booleans matters because a column
# that was parsed as bool on load would otherwise be turned into all False.
for x in ["default_profile","default_profile_image","has_extended_profile", 'verified']:
    df2[x] = [
        (isinstance(y, (bool, np.bool_)) and bool(y))
        or (isinstance(y, str) and y.strip().upper() == 'TRUE')
        for y in df2[x]
    ]

# Force the count columns to numbers: unparseable entries become NaN and are
# then replaced by the median of the column's parsed values.
for x in ['followers_count', 'friends_count', 'listed_count','favorites_count','statuses_count']:
    as_number = pd.to_numeric(df2[x], errors='coerce')
    df2[x] = as_number.fillna(as_number.median())

In [12]:
# Train 50 forests of random size on the full training frame and flag an
# account as a bot if ANY of them predicts a truthy label (logical OR).
# The accumulator starts from all-False on every run, so the cell does not
# depend on a pre-existing df2['bot'] column and gives a fresh result when
# it is re-executed.
# NOTE(review): values already present in df2['bot'] are no longer OR-ed in;
# confirm the submission CSV carries no meaningful labels.
bot_votes = np.zeros(len(df2), dtype=bool)
for _ in range(50):
    random_forest = RandomForestClassifier(n_estimators=random.randint(100,500))
    random_forest.fit(df[numfeatures],df['bot'])
    bot_votes |= np.asarray(random_forest.predict(df2[numfeatures])).astype(bool)
df2['bot'] = bot_votes

# The id header may carry a UTF-8 byte-order mark (U+FEFF) when the CSV is
# read without BOM handling. Spell the mark as an explicit escape instead of
# an invisible character, and fall back to plain 'id' when it is absent.
id_column = '\ufeffid' if '\ufeffid' in df2.columns else 'id'
submission_k = pd.DataFrame({
        'id': df2[id_column].astype(int),
        "bot": df2['bot'].astype(int)
    })
submission_k.to_csv('/Users/lilixu/Desktop/submission_k.csv',header=True, index=False)