In [None]:
import string
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import feature_extraction
from sklearn import pipeline
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
# Load the French corpus. The file has no header row, so the single column
# is named "French" explicitly.
# NOTE(review): read_csv splits on commas by default; this assumes no line in
# french.txt contains a comma — confirm against the raw file.
french_df = pd.read_csv(
    'french.txt',
    names=["French"],
    header=None,
)
french_df.head()

Unnamed: 0,French
0,Le Corbeau
1,Une fois
2,fatigue
3,pendant que je donnais de la tete
4,tapotement


In [None]:
# Load the Spanish corpus. The file has no header row, so the single column
# is named "Spanish" explicitly.
# NOTE(review): the preview of this frame shows sequences such as "Ã±" where
# accented letters are expected, which looks like an encoding mismatch —
# confirm the file's real encoding and consider passing encoding= explicitly.
spanish_df = pd.read_csv(
    'spanish.txt',
    names=["Spanish"],
    header=None,
)
spanish_df.head()

Unnamed: 0,Spanish
0,Â¿seÃ±or Soprano?
1,SÃ­.
2,Tome asiento.
3,SegÃºn me dice el doctor Cusamano...
4,...su mÃ©dico de cabecera


In [None]:
# Show every character that will be stripped, space-separated on one line.
print(*string.punctuation, end=" ")

# Deletion table for str.translate: each ASCII punctuation code point maps to
# None, which removes that character from the translated string.
translate_table = str.maketrans("", "", string.punctuation)

! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~ 

In [None]:
# Build the cleaned French samples and their labels as two parallel lists.
# Cleaning steps: lowercase, remove digit runs, remove ASCII punctuation.
data_french = []
lang_french = []
for text in french_df['French']:
  # pandas stores missing / NA-like cells as float NaN, which supports neither
  # len() nor .lower(); only real strings can be cleaned.
  if not isinstance(text, str):
    continue
  text = text.lower()
  text = re.sub(r"\d+", "", text)
  text = text.translate(translate_table)
  # Test for emptiness after cleaning, not before: a line made only of digits
  # and punctuation (e.g. "...") would otherwise be kept as an empty sample.
  if not text.strip():
    continue
  data_french.append(text)
  lang_french.append("French")

In [None]:
# Build the cleaned Spanish samples and their labels as two parallel lists.
# Cleaning steps: lowercase, remove digit runs, remove ASCII punctuation.
data_spanish = []
lang_spanish = []
for text in spanish_df['Spanish']:
  # pandas stores missing / NA-like cells as float NaN, which supports neither
  # len() nor .lower(); only real strings can be cleaned.
  if not isinstance(text, str):
    continue
  text = text.lower()
  text = re.sub(r"\d+", "", text)
  text = text.translate(translate_table)
  # Test for emptiness after cleaning, not before: a line made only of digits
  # and punctuation (e.g. "...") would otherwise be kept as an empty sample.
  if not text.strip():
    continue
  data_spanish.append(text)
  lang_spanish.append("Spanish")

In [None]:
# Stack the two corpora into one frame: French rows first, then Spanish.
# The text list and the label list are concatenated in the same order, so
# each text stays aligned with its language label.
all_texts = data_french + data_spanish
all_labels = lang_french + lang_spanish
df = pd.DataFrame({"Text": all_texts, "language": all_labels})

In [None]:
# Sanity check: (rows, columns) of the combined French + Spanish frame.
df.shape

(1401, 2)

In [None]:
# Preview the first rows of the combined frame (the French samples come first).
df.head()

Unnamed: 0,Text,language
0,le corbeau,French
1,une fois,French
2,fatigue,French
3,pendant que je donnais de la tete,French
4,tapotement,French


In [None]:
# Features are the cleaned text (first column), targets the language label
# (second column).
X = df["Text"]
y = df["language"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# TF-IDF features over character n-grams of length 1 to 3.
vectorizer = feature_extraction.text.TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 3),
)

# Two-step pipeline: vectorize the raw text, then classify it with a
# logistic regression left at its default settings.
pipe_lr_r13 = pipeline.Pipeline(steps=[
    ('vectorizer', vectorizer),
    ('clf', linear_model.LogisticRegression()),
])

In [None]:
# Fit the whole pipeline (TF-IDF vectorizer, then logistic regression) on the
# training split.
pipe_lr_r13.fit(X_train, y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(analyzer='char', ngram_range=(1, 3))),
                ('clf', LogisticRegression())])

In [None]:
# Predict a language label for each held-out test text.
y_predicted = pipe_lr_r13.predict(X_test)

In [None]:
# Fraction of test texts whose predicted label matches the true one,
# reported as a percentage.
accuracy_fraction = metrics.accuracy_score(y_test, y_predicted)
acc = accuracy_fraction * 100
print(acc, '%')

91.10320284697508 %
