In [1]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import pandas
import numpy as np

def get_class_name_from_id(ids, mapping):
    """Translate a sequence of class ids into their class names.

    Args:
        ids: Iterable of keys to look up in ``mapping``.
        mapping: Subscriptable object (e.g. a dict) from id to class name.

    Returns:
        A list containing ``mapping[key]`` for every key in ``ids``,
        in iteration order.

    Raises:
        KeyError: If ``mapping`` is a dict and an id is missing from it.
    """
    names = []
    for class_id in ids:
        names.append(mapping[class_id])
    return names

# Read the language-detection corpus; a 'lang' column is required by the mapping below.
full_dataset = pandas.read_csv('language_detection_1000.csv', encoding='utf-8')

# Numeric class id for every supported language, plus the reverse lookup.
lang_to_id = dict(polish=0, english=1, french=2, german=3, italian=4, spanish=5)
id_to_lang = dict((idx, name) for name, idx in lang_to_id.items())

# Values of 'lang' that are not keys of lang_to_id become NaN in 'label_num'.
full_dataset['label_num'] = full_dataset['lang'].map(lang_to_id)

# Seed the global NumPy RNG so the random split below is reproducible.
np.random.seed(0)

# One uniform draw in [0, 1) per row; rows with a draw below 0.7 (~70%) go to train.
train_indices = np.random.rand(full_dataset.shape[0]) < 0.7

# The boolean mask and its complement partition the rows into two disjoint sets.
train = full_dataset.loc[train_indices]
test = full_dataset.loc[~train_indices]

# Character-bigram TF-IDF features (vocabulary capped at 300) -> scaling -> logistic regression.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=300, analyzer='char', ngram_range=(2,2))),
    # with_mean=False: scale without centring, which keeps sparse TF-IDF input usable.
    ('scaler', StandardScaler(with_mean = False)),
    # The default iteration cap made the solver stop early on this data
    # ("TOTAL NO. of ITERATIONS REACHED LIMIT"), leaving a non-converged model.
    # Raise the cap so the optimiser can finish; increase further if the
    # ConvergenceWarning still appears.
    ('clf', LogisticRegression(max_iter=1000)),
])

# Train on the raw 'text' column against the numeric language labels.
pipeline.fit(train['text'], train['label_num'])

# Per-language precision/recall/F1 on the held-out rows. Numeric ids are
# converted back to language names so the report is labelled by name.
print(
    classification_report(
        y_true=get_class_name_from_id(test['label_num'], id_to_lang),
        y_pred=get_class_name_from_id(pipeline.predict(test['text']), id_to_lang),
    )
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

     english       1.00      1.00      1.00       303
      french       1.00      1.00      1.00       280
      german       1.00      1.00      1.00       337
     italian       1.00      1.00      1.00       273
      polish       1.00      1.00      1.00       291
     spanish       1.00      1.00      1.00       299

    accuracy                           1.00      1783
   macro avg       1.00      1.00      1.00      1783
weighted avg       1.00      1.00      1.00      1783



In [2]:
# Classify a single example; predict() expects an iterable of documents,
# so the text is wrapped in a one-element list.
text_to_predict = "Bonjour!"
predicted = pipeline.predict([text_to_predict])

# Convert the first (only) predicted id back to its language name.
predicted_label = id_to_lang[predicted[0]]
print(
    "Tekst: {t} został zaklasyfikowany jako: {p}".format(
        t=text_to_predict, p=predicted_label
    )
)

Tekst: Bonjour! został zaklasyfikowany jako: french
