In [1]:
import pandas as pd  # Importing the pandas library for data manipulation and analysis
import numpy as np  # Importing the numpy library for numerical operations and array handling
from sklearn.feature_extraction.text import CountVectorizer  # Importing CountVectorizer for converting text to a matrix of token counts
from sklearn.model_selection import train_test_split  # Importing train_test_split for splitting datasets into training and testing sets
from sklearn.naive_bayes import MultinomialNB  # Importing MultinomialNB for implementing the Naive Bayes classification algorithm

In [2]:
# Importing the pandas library to work with data
data = pd.read_csv(r'C:\Users\Mohhit\Desktop\CSV FILE\language.csv')  # Reading a CSV file from the specified path into a DataFrame

# Displaying the first five rows of the DataFrame to get an overview of the data
data.head()  

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch


# Assistant
The selected code reads a CSV file into a Pandas DataFrame and displays the first few rows of the DataFrame. Here's a step-by-step explanation:

1. `data = pd.read_csv(r'C:\Users\Mohhit\Desktop\CSV FILE\language.csv')` - This line uses the `read_csv()` function from the Pandas library (`pd`) to read the contents of a CSV file located at the specified path. The `r` before the string indicates a raw string, which helps to handle backslashes in the file path correctly.

2. `data.head()` - This method is called on the DataFrame `data` to display the first five rows of the DataFrame. It provides a quick overview of the data structure and the first few entries, which is useful for understanding the contents of the CSV file.

In [4]:
print(data.head())

                                                Text  language
0  klement gottwaldi surnukeha palsameeriti ning ...  Estonian
1  sebes joseph pereira thomas  på eng the jesuit...   Swedish
2  ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...      Thai
3  விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...     Tamil
4  de spons behoort tot het geslacht haliclona en...     Dutch


In [5]:
data

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch
...,...,...
21995,hors du terrain les années et sont des année...,French
21996,ใน พศ หลักจากที่เสด็จประพาสแหลมมลายู ชวา อินเ...,Thai
21997,con motivo de la celebración del septuagésimoq...,Spanish
21998,年月，當時還只有歲的她在美國出道，以mai-k名義推出首張英文《baby i like》，由...,Chinese


In [6]:
data.shape

(22000, 2)

In [7]:
# Calculate the total number of missing values in each column of the DataFrame 'data'
data.isnull().sum()

Text        0
language    0
dtype: int64

In [8]:
# Count the occurrences of each unique value in the 'language' column of the DataFrame 'data'
data['language'].value_counts()

language
Estonian      1000
Swedish       1000
English       1000
Russian       1000
Romanian      1000
Persian       1000
Pushto        1000
Spanish       1000
Hindi         1000
Korean        1000
Chinese       1000
French        1000
Portugese     1000
Indonesian    1000
Urdu          1000
Latin         1000
Turkish       1000
Japanese      1000
Dutch         1000
Tamil         1000
Thai          1000
Arabic        1000
Name: count, dtype: int64

In [9]:
# Convert the 'Text' column of the DataFrame 'data' into a NumPy array and assign it to variable x
x = np.array(data['Text'])

# Convert the 'language' column of the DataFrame 'data' into a NumPy array and assign it to variable y
y = np.array(data['language'])

# Print the contents of the array x, which contains the text data
print(x)

['klement gottwaldi surnukeha palsameeriti ning paigutati mausoleumi surnukeha oli aga liiga hilja ja oskamatult palsameeritud ning hakkas ilmutama lagunemise tundemärke  aastal viidi ta surnukeha mausoleumist ära ja kremeeriti zlíni linn kandis aastatel – nime gottwaldov ukrainas harkivi oblastis kandis zmiivi linn aastatel – nime gotvald'
 'sebes joseph pereira thomas  på eng the jesuits and the sino-russian treaty of nerchinsk  the diary of thomas pereira bibliotheca instituti historici s i --   rome libris '
 'ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เริ่มตั้งแต่ถนนสนามไชยถึงแม่น้ำเจ้าพระยาที่ถนนตก กรุงเทพมหานคร เป็นถนนรุ่นแรกที่ใช้เทคนิคการสร้างแบบตะวันตก ปัจจุบันผ่านพื้นที่เขตพระนคร เขตป้อมปราบศัตรูพ่าย เขตสัมพันธวงศ์ เขตบางรัก เขตสาทร และเขตบางคอแหลม'
 ...
 'con motivo de la celebración del septuagésimoquinto ° aniversario de la fundación del departamento en  guillermo ceballos espinosa presentó a la gobernación de caldas por encargo de su titular dilia estrada de gómez el h

In [10]:
print(y)

['Estonian' 'Swedish' 'Thai' ... 'Spanish' 'Chinese' 'Romanian']


In [11]:
# Initialize the CountVectorizer to convert a collection of text documents to a matrix of token counts
cv = CountVectorizer()

# Fit the CountVectorizer on the input data 'x' and transform it into a sparse matrix of token counts
X = cv.fit_transform(x)

# Split the dataset into training and testing sets
# X_train and y_train will be used for training the model
# X_test and y_test will be used for evaluating the model's performance
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [12]:
X_train

<14740x277720 sparse matrix of type '<class 'numpy.int64'>'
	with 613529 stored elements in Compressed Sparse Row format>

In [13]:
print(X_train)

  (0, 197295)	2
  (0, 197708)	1
  (0, 197801)	1
  (0, 198388)	1
  (0, 197467)	1
  (0, 197865)	2
  (0, 197604)	1
  (0, 198428)	1
  (0, 198501)	1
  (0, 198556)	1
  (0, 197332)	1
  (0, 197485)	2
  (0, 198123)	1
  (0, 197892)	1
  (0, 197990)	1
  (0, 198053)	1
  (0, 198417)	1
  (0, 197623)	1
  (1, 197641)	2
  (1, 197314)	1
  (1, 197931)	1
  (1, 197804)	3
  (1, 198397)	1
  (1, 197149)	1
  (1, 197781)	1
  :	:
  (14738, 188817)	1
  (14738, 192004)	1
  (14738, 157171)	1
  (14738, 190346)	1
  (14738, 190725)	1
  (14738, 189685)	1
  (14738, 159269)	2
  (14738, 145431)	1
  (14738, 173292)	1
  (14738, 176062)	1
  (14738, 159959)	1
  (14738, 190198)	1
  (14738, 167124)	1
  (14738, 168158)	1
  (14738, 180260)	2
  (14738, 153262)	1
  (14738, 162150)	1
  (14738, 153355)	1
  (14738, 178104)	1
  (14738, 163770)	1
  (14739, 223002)	1
  (14739, 235170)	1
  (14739, 222446)	1
  (14739, 221922)	1
  (14739, 242446)	1


In [14]:
print(y_train)

['Tamil' 'Tamil' 'Arabic' ... 'Turkish' 'Pushto' 'Japanese']


In [15]:
print(X_test)

  (0, 220400)	1
  (0, 239062)	4
  (0, 242970)	1
  (0, 25038)	1
  (0, 221234)	1
  (0, 70963)	1
  (0, 240942)	1
  (0, 224990)	1
  (0, 221573)	1
  (0, 43094)	1
  (0, 248235)	1
  (0, 220472)	1
  (0, 24805)	1
  (0, 241899)	1
  (0, 240734)	1
  (0, 24810)	1
  (0, 221572)	1
  (0, 240048)	1
  (0, 219346)	1
  (0, 226397)	1
  (0, 220513)	1
  (0, 226374)	1
  (0, 241519)	1
  (0, 248305)	1
  (0, 218897)	1
  :	:
  (7259, 109162)	1
  (7259, 98689)	1
  (7259, 41967)	1
  (7259, 49056)	1
  (7259, 115442)	1
  (7259, 51250)	1
  (7259, 84417)	1
  (7259, 16171)	1
  (7259, 52768)	1
  (7259, 19221)	1
  (7259, 106599)	1
  (7259, 41845)	1
  (7259, 111204)	1
  (7259, 29829)	1
  (7259, 14605)	1
  (7259, 28982)	1
  (7259, 84272)	1
  (7259, 27781)	1
  (7259, 13964)	1
  (7259, 111206)	1
  (7259, 74001)	1
  (7259, 35990)	1
  (7259, 17979)	1
  (7259, 16178)	1
  (7259, 22637)	1


In [16]:
# Create an instance of the Multinomial Naive Bayes model
model = MultinomialNB()

# Fit the model to the training data (X_train) and corresponding labels (y_train)
model.fit(X_train, y_train)

In [17]:
model.score(X_test,y_test)

0.953168044077135

In [49]:
# Prompt the user to enter a text input
user = input('Enter a text')

# Transform the user input into a format suitable for the model using the CountVectorizer
data = cv.transform([user]).toarray()

# Use the trained model to predict the output based on the transformed input data
output = model.predict(data)

# Print the predicted output
print(output)

Enter a text Mohit este un om drăguț


['Romanian']
