In [104]:
import pandas as pd
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree


In [2]:
# Input locations, relative to the notebook's working directory.
train_file_path, test_file_path = "./data/train.csv", "./data/test.csv"

# Name of the label column and the list of feature columns.
train_label = "target"
train_data_cols = ["id", "keyword", "location", "text"]

# Columns to request from each CSV: the label first, then the features.
# `test_cols` is the very same list object as `train_data_cols` (no copy).
train_cols = [train_label, *train_data_cols]
test_cols = train_data_cols
    

In [3]:
# Read only the declared columns from each CSV; `train_cols` additionally
# includes the label column, `test_cols` does not.
train_df = pd.read_csv(filepath_or_buffer=train_file_path, usecols=train_cols)
test_df = pd.read_csv(filepath_or_buffer=test_file_path, usecols=test_cols)

In [4]:
# Preview the first five rows of the test frame.
test_df.head(n=5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [5]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [126]:
def data_clean(text: str) -> str:
    """Normalize one raw text sample into a lowercase, punctuation-free string.

    The steps are applied in this order:
      1. every ``<...>`` tag is replaced by a space;
      2. backslash-escaped double quotes are removed;
      3. newlines and tabs become spaces;
      4. plain double quotes are removed;
      5. every character in ``string.punctuation`` is removed;
      6. runs of spaces are collapsed to a single space;
      7. each run of digits is replaced by a single ``0``;
      8. the result is lowercased.

    Leading and trailing spaces are not stripped.

    Args:
        text: The raw text. ``None`` and float NaN (what pandas yields for an
            empty cell) are treated as empty text; any other non-string value
            is converted with ``str`` before cleaning.

    Returns:
        The cleaned text.
    """
    if not isinstance(text, str):
        # NaN is the only value that compares unequal to itself.
        if text is None or text != text:
            return ''
        text = str(text)

    text = re.sub(r'<[^<]+?>', ' ', text)
    text = text.replace('\\"', '')
    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')
    text = text.replace('"', '')
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r' +', ' ', text)
    # Raw string on purpose: '\d' inside a plain literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    text = re.sub(r'\d+', '0', text)

    return text.lower()

def data_preprocessing(train_data: pd.DataFrame, test_data: pd.DataFrame, label_col: str, text_column: str,
                       test_size: float = 0.1, random_state: int = 255):
    """Clean the text, hold out a validation split and build TF-IDF features.

    The vectorizer is fitted on the training split only; the validation split
    and the test set are transformed with that fitted vocabulary.

    Args:
        train_data: Labelled frame containing ``text_column`` and ``label_col``.
        test_data: Unlabelled frame containing ``text_column``.
        label_col: Name of the label column in ``train_data``.
        text_column: Name of the text column present in both frames.
        test_size: Share of ``train_data`` held out for validation, passed
            straight to ``train_test_split``.
        random_state: Seed for the shuffled split.

    Returns:
        A 5-tuple ``(tfidf_train, tfidf_validation, y_train, y_validation,
        tfidf_test)``: the TF-IDF matrices of the three sets and the labels of
        the training and validation splits.
    """
    cleaned_train_data = [data_clean(sample) for sample in train_data[text_column]]
    cleaned_test_data = [data_clean(sample) for sample in test_data[text_column]]

    x_train, x_validation, y_train, y_validation = train_test_split(
        cleaned_train_data, train_data[label_col],
        test_size=test_size, random_state=random_state, shuffle=True)

    # Word-level uni- to tri-grams, English stop words dropped, L2-normalised rows.
    tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1, 3), norm='l2')

    # Fit on the training split only so nothing from validation/test leaks
    # into the vocabulary or the IDF weights.
    tfidf_train = tfidf_vectorizer.fit_transform(x_train)
    tfidf_validation = tfidf_vectorizer.transform(x_validation)
    tfidf_test = tfidf_vectorizer.transform(cleaned_test_data)

    return tfidf_train, tfidf_validation, y_train, y_validation, tfidf_test


In [129]:
# Clean the text, hold out a validation split and vectorize all three sets.
(tfidf_train, tfidf_validation,
 y_train, y_validaion, tfidf_test) = data_preprocessing(
    train_df, test_df, label_col='target', text_column='text')

# Display the shape of each TF-IDF matrix: train, test, validation.
tuple(matrix.shape for matrix in (tfidf_train, tfidf_test, tfidf_validation))

  (0, 37149)	0.1913895318089874
  (0, 31343)	0.1913895318089874
  (0, 24770)	0.1913895318089874
  (0, 76139)	0.1913895318089874
  (0, 9373)	0.1913895318089874
  (0, 23726)	0.1913895318089874
  (0, 13501)	0.1913895318089874
  (0, 102818)	0.1913895318089874
  (0, 13577)	0.1913895318089874
  (0, 36597)	0.1913895318089874
  (0, 68942)	0.1913895318089874
  (0, 37148)	0.1913895318089874
  (0, 31342)	0.1913895318089874
  (0, 24769)	0.1913895318089874
  (0, 76138)	0.1913895318089874
  (0, 9372)	0.1913895318089874
  (0, 23725)	0.1913895318089874
  (0, 13500)	0.1913895318089874
  (0, 102817)	0.1913895318089874
  (0, 13576)	0.1828983933088555
  (0, 36596)	0.1913895318089874
  (0, 58086)	0.11881104609696473
  (0, 68925)	0.14024257359236528
  (0, 37092)	0.12440380131173821
  (0, 31322)	0.12483560473923737
  (0, 24751)	0.13145011011973975
  (0, 76111)	0.11947591858339354
  (0, 9371)	0.1913895318089874
  (0, 23722)	0.13467829115462568
  (0, 102808)	0.12918393106659462
  (0, 13468)	0.23211122058014372

In [128]:
# Linear SVM with hinge loss, wrapped in a sigmoid calibrator fitted with 2-fold CV.
linear_svc = LinearSVC(random_state=55, loss='hinge')
classifier = CalibratedClassifierCV(linear_svc, method='sigmoid', cv=2)
classifier.fit(tfidf_train, y_train)

# Accuracy on the held-out validation split.
pred_validation = classifier.predict(tfidf_validation)
accuracy = metrics.accuracy_score(y_validaion, pred_validation)

print("accuracy", accuracy)

# Predict the test set and write the `id,target` file.
pred = classifier.predict(tfidf_test)

# `.values` pairs ids and predictions by position, so the result does not
# depend on `test_df` having a default 0..n-1 index.
submission = pd.DataFrame({'id': test_df['id'].values, 'target': pred})
submission.to_csv('output.csv', index=False)

print("done")

accuracy 0.7979002624671916
done


In [102]:
# Show the wrapped estimator's configuration. scikit-learn renamed
# `base_estimator` to `estimator` in 1.2 and removed the old name in 1.4,
# so read whichever attribute this installation provides.
classifier.estimator if hasattr(classifier, 'estimator') else classifier.base_estimator

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
          penalty='l2', random_state=55, tol=0.0001, verbose=0)