In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'traincsv2:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4921962%2F8286807%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240504%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240504T085511Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D5e69f08fb0d98cdc6226e7ac87f6863c4745f45feafe83b218e2f9edc74375b6e9e954a8c86369cd8fa772753fefd12fa13c499e4ff2910ef9c12e692ffc09ee78917d5ab05595f10d41a35cd91e2532a044acd4a1771e0c5af337ced83c6a4beb61ebd906e7564dad5926229520f9235397341bb547051ada6f629974da4e69ee133246e7bc6ea5c18ce7b6dda579a4cd35078e99662142411ab8a9ad6b994f3a8cd7939eab2c9f78afcba179e1c456797c969b29c6d7a12b7aa1fb6d85ade4bfb5ee2f30830d25e94e08e3291b0ffad34efa109921823bfe5203845adbe8d26570bb5ba21787622f0a38d16de48d974b2d7cda0347877dc76279a063123d5c,testdata:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4936288%2F8309916%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240504%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240504T085511Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D9dc3f57d60e55eb2ccbaa51c4c27791a60a68186e1a90b90cf35e2829519b42c79e4c928a153ae04ddd5e6960a007664e72084592148e1cd4127d16c176bb03cebd01f3c99fd9f00984b40e2ad2b94c6e1853fb11aaeaa22869115d95e09ce3affb07bd71b0827a69f61b5cd4dd6d5080051213a70d7234371d182408de181a684019dce7baa0193d55dc009078b478a3235d75ba24a0a44520e9dc0ac5390724e92ead2dd1fe6165177904f1fe00a4012c097e49a2b7ca3917840213733276d5095022141897b1ad1fecd56e9b453d6f72bfbb754e71a1c7bcbccfc1bff0e918ae85816ef2c3fa5dc37e40adf633cc4a9834a8be1356632ecd9c36236f6f273'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading traincsv2, 55761876 bytes compressed
Downloaded and uncompressed: traincsv2
Downloading testdata, 13902150 bytes compressed
Downloaded and uncompressed: testdata
Data source import complete.


In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/testdata/test.csv
/kaggle/input/traincsv2/train.csv


In [51]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
import csv

In [61]:
train_data = pd.read_csv('/kaggle/input/traincsv2/train.csv')

In [62]:
train_data.shape

(87398, 17)

In [63]:
train_data.head(10)

Unnamed: 0,teacher_id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_essay_1,project_essay_2,project_essay_3,project_essay_4,project_resource_summary,teacher_number_of_previously_posted_projects,project_is_approved,total_quantity,total_price
0,22aa8e52e6268788a9e5c10932da6f54,Mrs.,NJ,2016-06-20 23:05:15,Grades 3-5,Literacy & Language,"Literacy, Literature & Writing",To Infinity and Beyond...Let's Edit!,"\""Can you be our teacher next year, you made S...",I would love for my students to have the oppor...,,,My students need a document camera to assist i...,0,1,1,264.99
1,59d70f9999d4b63a2c95c42d61d8468c,Ms.,GA,2016-09-01 08:11:20,Grades PreK-2,"Literacy & Language, Special Needs","Literacy, Special Needs",Reading With Our Ears?!,Living in poverty often means lack of access t...,Can you imagine a child not having the simple ...,,,My students need a listening center with headp...,12,1,5,847.0
2,168074f8c5e0bec129596844eab8c53d,Mr.,TX,2016-05-09 11:40:49,Grades 9-12,Applied Learning,Extracurricular,Debate Materials for Upcoming Year,We are a High School Speech and Debate Team. W...,My students are a mixture of male and female. ...,They will use the materials for basic function...,"We have to beg, borrow and steal materials fro...",My students need basic supplies for the upcomi...,1,1,13,222.2
3,2ac6e7a8b435d3fa6dc3c53a910de7e0,Ms.,CA,2016-11-27 18:15:15,Grades PreK-2,"Literacy & Language, Special Needs","Literacy, Special Needs",Help letter sounds come to life for my students!,My students are a fun bunch of diverse kids wh...,Alphabet and sound tubs will allow sounds to c...,,,My students need letter sound tubs to allow th...,0,1,5,318.97
4,6492fd8d2fe4961e207facd59b807c39,Ms.,LA,2016-07-07 19:20:49,Grades 3-5,Literacy & Language,Literacy,Using Technology to Develop Comprehension Skil...,My students are working very hard to increase ...,I want to began the school year with a place w...,,,My students need a large class rug to feel fre...,0,0,25,606.41
5,43016a259c80bfd25b6c33653b18a0cb,Teacher,IN,2017-01-11 15:49:50,Grades PreK-2,"Literacy & Language, Math & Science","Literacy, Mathematics",An I-Pad Opens Your Eye to Learning,What makes my students special is that they al...,Your donation to our project will improve my k...,,,My students need I-Pads to help them be succes...,0,1,5,377.99
6,16f5d3707003abc5e998352285500fb3,Mrs.,NY,2016-12-04 13:36:41,Grades PreK-2,"Literacy & Language, Math & Science","Literacy, Mathematics",Let's Boogie & Write!,My students are excited to come to school each...,"My students use whiteboards, dry erase markers...",,,My students need boogie boards to use during s...,3,1,7,45.28
7,d6f8c96acbbfa1401d64337699b3da43,Ms.,TX,2016-07-02 15:40:47,Grades 3-5,Math & Science,Mathematics,Mathematicians Manipulating Manipulatives,I teach at a Title I school where 100 percent ...,Math is not the same as it once was. Students ...,,,"My students need magnetic place value blocks, ...",2,1,10,138.95
8,6814fe8452729ecbe4acb9f8177bae4c,Mrs.,SC,2016-06-14 16:31:45,Grades 6-8,Math & Science,"Applied Sciences, Environmental Science",Save the Penguins STEM Project!,I teach middle school students in the outskirt...,I teach 6th grade science and all year my stud...,,,My students need materials for STEM engineerin...,0,0,86,412.76
9,1d832518dafa3ad6217446ceb63c79f1,Mr.,GA,2016-08-17 21:37:24,Grades 6-8,Math & Science,Applied Sciences,Keep STEAM Fun!,"My students are your typical 6th, 7th, and 8th...",Every student loves to hold and touch what the...,,,My students need a way to learn coding that ta...,6,1,28,177.4


In [64]:
train_data.isna().sum()

teacher_id                                          0
teacher_prefix                                      3
school_state                                        0
project_submitted_datetime                          0
project_grade_category                              0
project_subject_categories                          0
project_subject_subcategories                       0
project_title                                       0
project_essay_1                                     0
project_essay_2                                     0
project_essay_3                                 84325
project_essay_4                                 84325
project_resource_summary                            0
teacher_number_of_previously_posted_projects        0
project_is_approved                                 0
total_quantity                                      0
total_price                                         0
dtype: int64

In [65]:
train_data.drop(columns=['teacher_id','teacher_prefix','school_state','project_submitted_datetime'], inplace=True)

In [66]:
train_data.isna().sum()

project_grade_category                              0
project_subject_categories                          0
project_subject_subcategories                       0
project_title                                       0
project_essay_1                                     0
project_essay_2                                     0
project_essay_3                                 84325
project_essay_4                                 84325
project_resource_summary                            0
teacher_number_of_previously_posted_projects        0
project_is_approved                                 0
total_quantity                                      0
total_price                                         0
dtype: int64

In [67]:
train_data['project_essay_3'] = train_data['project_essay_3'].fillna('')
train_data['project_essay_4'] = train_data['project_essay_4'].fillna('')

In [68]:
train_data.isna().sum()

project_grade_category                          0
project_subject_categories                      0
project_subject_subcategories                   0
project_title                                   0
project_essay_1                                 0
project_essay_2                                 0
project_essay_3                                 0
project_essay_4                                 0
project_resource_summary                        0
teacher_number_of_previously_posted_projects    0
project_is_approved                             0
total_quantity                                  0
total_price                                     0
dtype: int64

In [69]:
train_data['project_is_approved'].value_counts()

project_is_approved
1    74217
0    13181
Name: count, dtype: int64

In [70]:
from sklearn import preprocessing
LE = preprocessing.LabelEncoder()

train_data['project_grade_category'] = LE.fit_transform(train_data['project_grade_category'])
train_data['project_subject_categories'] = LE.fit_transform(train_data['project_subject_categories'])
train_data['project_subject_subcategories'] = LE.fit_transform(train_data['project_subject_subcategories'])

print("Unique values for project_grade_category : ",train_data.project_grade_category.unique())
print("Unique values for project_subject_categories : ",train_data.project_subject_categories.unique())
print("Unique values for project_subject_subcategories : ",train_data.project_subject_subcategories.unique())
print("Unique values for project_subject_subcategories : ",train_data.project_subject_subcategories.unique())
print("Unique values for teacher_number_of_previously_posted_projects : ",train_data.teacher_number_of_previously_posted_projects.unique())
print("Unique values for teacher_number_of_previously_posted_projects : ",train_data.teacher_number_of_previously_posted_projects.unique())
print("Unique values for total_quantity : ",train_data.total_quantity.unique())
print("Unique values for total_price : ",train_data.total_price.unique())

Unique values for project_grade_category :  [0 3 2 1]
Unique values for project_subject_categories :  [24 30  0 28 32  5 33 46 40 38  3  8 50 29 36 22 48  1 19 35  6 13  4 11
 25 27  9 34 16 14  2 37 12 44 21 20 26 47 49  7 17 39 42 10 43 41 15 18
 31 23 45]
Unique values for project_subject_subcategories :  [311 319 205 310 312 335   8   0 162 136 122   3  18 385 376 278 157 391
 266 389 393 325 323 288 324  16  13  25 282 382 331 146 387 154 346  69
  17 145 321 191 251 264  91 134 333 103 365 253 287 192 349 294  92 189
 165 161 268 285 284 121 316 318 242 342 289 298 313 190  67 237  73  24
 299 235 184 293 244 171  20 143 120 194  27  42 222 361 156 317   9 123
   5 158 330 203 133 286 243 297  77 223 351 275  45 269 308 259 174 135
 167  68  90  37 230  50 305 329 336 343 315 344  47 320 211 271 101  62
  44 380 354 267 124  29 381 386 148  46 262 265  72  87 147  15  31 270
 213   4 255 113 183 216  19  53 152 107  57 300 169 137  99  21 193 388
  95 258  14 153 367 129  61 378 

In [71]:
train_data.head(10)

Unnamed: 0,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_essay_1,project_essay_2,project_essay_3,project_essay_4,project_resource_summary,teacher_number_of_previously_posted_projects,project_is_approved,total_quantity,total_price
0,0,24,311,To Infinity and Beyond...Let's Edit!,"\""Can you be our teacher next year, you made S...",I would love for my students to have the oppor...,,,My students need a document camera to assist i...,0,1,1,264.99
1,3,30,319,Reading With Our Ears?!,Living in poverty often means lack of access t...,Can you imagine a child not having the simple ...,,,My students need a listening center with headp...,12,1,5,847.0
2,2,0,205,Debate Materials for Upcoming Year,We are a High School Speech and Debate Team. W...,My students are a mixture of male and female. ...,They will use the materials for basic function...,"We have to beg, borrow and steal materials fro...",My students need basic supplies for the upcomi...,1,1,13,222.2
3,3,30,319,Help letter sounds come to life for my students!,My students are a fun bunch of diverse kids wh...,Alphabet and sound tubs will allow sounds to c...,,,My students need letter sound tubs to allow th...,0,1,5,318.97
4,0,24,310,Using Technology to Develop Comprehension Skil...,My students are working very hard to increase ...,I want to began the school year with a place w...,,,My students need a large class rug to feel fre...,0,0,25,606.41
5,3,28,312,An I-Pad Opens Your Eye to Learning,What makes my students special is that they al...,Your donation to our project will improve my k...,,,My students need I-Pads to help them be succes...,0,1,5,377.99
6,3,28,312,Let's Boogie & Write!,My students are excited to come to school each...,"My students use whiteboards, dry erase markers...",,,My students need boogie boards to use during s...,3,1,7,45.28
7,0,32,335,Mathematicians Manipulating Manipulatives,I teach at a Title I school where 100 percent ...,Math is not the same as it once was. Students ...,,,"My students need magnetic place value blocks, ...",2,1,10,138.95
8,1,32,8,Save the Penguins STEM Project!,I teach middle school students in the outskirt...,I teach 6th grade science and all year my stud...,,,My students need materials for STEM engineerin...,0,0,86,412.76
9,1,32,0,Keep STEAM Fun!,"My students are your typical 6th, 7th, and 8th...",Every student loves to hold and touch what the...,,,My students need a way to learn coding that ta...,6,1,28,177.4


In [27]:
pip install nltk



In [None]:
#nltk.download('all')

In [28]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [29]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [72]:

def stem_text(text):
    stemmer = PorterStemmer()
    tokenized = word_tokenize(text)
    return ' '.join([stemmer.stem(word) for word in tokenized])

def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    tokenized = word_tokenize(text)
    return ' '.join([lemmatizer.lemmatize(word) for word in tokenized])

In [31]:
!unzip -o /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/


unzip:  cannot find or open /usr/share/nltk_data/corpora/wordnet.zip, /usr/share/nltk_data/corpora/wordnet.zip.zip or /usr/share/nltk_data/corpora/wordnet.zip.ZIP.


In [None]:
"""
data['combined_essay'] = data['project_essay_1'].fillna('') + " " + \
                         data['project_essay_2'].fillna('') + " " + \
                         data['project_essay_3'].fillna('') + " " + \
                         data['project_essay_4'].fillna('') + " " + \
                         data['project_resource_summary'].fillna('')
"""

In [73]:
#print(0)

#data['combined_essay'] = data['combined_essay'].astype(str).apply(stem_text)
#data['combined_essay'] = data['combined_essay'].astype(str).apply(lemmatize_text)

#data.drop(columns=['project_essay_1','project_essay_2','project_essay_3','project_essay_4', 'project_resource_summary'], inplace=True)

#print(1)


print(0)

train_data['project_essay_1'] = train_data['project_essay_1'].astype(str).apply(stem_text)
train_data['project_essay_1'] = train_data['project_essay_1'].astype(str).apply(lemmatize_text)

print(1)

train_data['project_essay_2'] = train_data['project_essay_2'].astype(str).apply(stem_text)
train_data['project_essay_2'] = train_data['project_essay_2'].astype(str).apply(lemmatize_text)

print(2)

train_data['project_essay_3'] = train_data['project_essay_3'].astype(str).apply(stem_text)
train_data['project_essay_3'] = train_data['project_essay_3'].astype(str).apply(lemmatize_text)

print(3)

train_data['project_essay_4'] = train_data['project_essay_4'].astype(str).apply(stem_text)
train_data['project_essay_4'] = train_data['project_essay_4'].astype(str).apply(lemmatize_text)

print(4)


train_data['project_resource_summary'] = train_data['project_resource_summary'].astype(str).apply(stem_text)
train_data['project_resource_summary'] = train_data['project_resource_summary'].astype(str).apply(lemmatize_text)

print(5)

train_data['project_title'] = train_data['project_title'].astype(str).apply(stem_text)
train_data['project_title'] = train_data['project_title'].astype(str).apply(lemmatize_text)


print(6)


0
1
2
3
4
5
6


In [74]:
train_data.isna().sum()

project_grade_category                          0
project_subject_categories                      0
project_subject_subcategories                   0
project_title                                   0
project_essay_1                                 0
project_essay_2                                 0
project_essay_3                                 0
project_essay_4                                 0
project_resource_summary                        0
teacher_number_of_previously_posted_projects    0
project_is_approved                             0
total_quantity                                  0
total_price                                     0
dtype: int64

In [75]:
train_data.head(10)

Unnamed: 0,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_essay_1,project_essay_2,project_essay_3,project_essay_4,project_resource_summary,teacher_number_of_previously_posted_projects,project_is_approved,total_quantity,total_price
0,0,24,311,to infin and beyond ... let 's edit !,"\ `` can you be our teacher next year , you ma...",i would love for my student to have the opport...,,,my student need a document camera to assist in...,0,1,1,264.99
1,3,30,319,read with our ear ? !,live in poverti often mean lack of access to t...,can you imagin a child not have the simpl plea...,,,my student need a listen center with headphon ...,12,1,5,847.0
2,2,0,205,debat materi for upcom year,we are a high school speech and debat team . w...,my student are a mixtur of male and femal . we...,they will use the materi for basic function of...,"we have to beg , borrow and steal materi from ...",my student need basic suppli for the upcom com...,1,1,13,222.2
3,3,30,319,help letter sound come to life for my student !,my student are a fun bunch of diver kid who lo...,alphabet and sound tub will allow sound to com...,,,my student need letter sound tub to allow them...,0,1,5,318.97
4,0,24,310,use technolog to develop comprehens skill with...,my student are work veri hard to increas their...,i want to began the school year with a place w...,,,my student need a larg class rug to feel free ...,0,0,25,606.41
5,3,28,312,an i-pad open your eye to learn,what make my student special is that they all ...,your donat to our project will improv my k-2 s...,,,my student need i-pad to help them be success ...,0,1,5,377.99
6,3,28,312,let 's boogi & write !,my student are excit to come to school each da...,"my student use whiteboard , dri era marker , p...",,,my student need boogi board to use dure small ...,3,1,7,45.28
7,0,32,335,mathematician manipul manipul,i teach at a titl i school where 100 percent o...,math is not the same a it onc wa . student now...,,,"my student need magnet place valu block , magn...",2,1,10,138.95
8,1,32,8,save the penguin stem project !,i teach middl school student in the outskirt o...,i teach 6th grade scienc and all year my stude...,,,my student need materi for stem engin project ...,0,0,86,412.76
9,1,32,0,keep steam fun !,"my student are your typic 6th , 7th , and 8th ...",everi student love to hold and touch what they...,,,my student need a way to learn code that take ...,6,1,28,177.4


In [76]:
y = train_data['project_is_approved'].values

In [77]:
train_data.drop(columns=['project_is_approved'], inplace=True)

In [79]:
train_data.head(5)

Unnamed: 0,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_essay_1,project_essay_2,project_essay_3,project_essay_4,project_resource_summary,teacher_number_of_previously_posted_projects,total_quantity,total_price
0,0,24,311,to infin and beyond ... let 's edit !,"\ `` can you be our teacher next year , you ma...",i would love for my student to have the opport...,,,my student need a document camera to assist in...,0,1,264.99
1,3,30,319,read with our ear ? !,live in poverti often mean lack of access to t...,can you imagin a child not have the simpl plea...,,,my student need a listen center with headphon ...,12,5,847.0
2,2,0,205,debat materi for upcom year,we are a high school speech and debat team . w...,my student are a mixtur of male and femal . we...,they will use the materi for basic function of...,"we have to beg , borrow and steal materi from ...",my student need basic suppli for the upcom com...,1,13,222.2
3,3,30,319,help letter sound come to life for my student !,my student are a fun bunch of diver kid who lo...,alphabet and sound tub will allow sound to com...,,,my student need letter sound tub to allow them...,0,5,318.97
4,0,24,310,use technolog to develop comprehens skill with...,my student are work veri hard to increas their...,i want to began the school year with a place w...,,,my student need a larg class rug to feel free ...,0,25,606.41


In [80]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import scipy.sparse

scaler = StandardScaler()

#'project_subject_categories','project_subject_subcategories','teacher_number_of_previously_posted_projects',
                        #'total_quantity'

tfidf_vectorizer = TfidfVectorizer()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, ['total_price']),
        ('tfidf1', TfidfVectorizer(), 'project_essay_1'),
        ('tfidf2', TfidfVectorizer(), 'project_essay_2'),
        ('tfidf3', TfidfVectorizer(), 'project_title'),
        ('tfidf4', TfidfVectorizer(), 'project_resource_summary'),
        ('tfidf5', TfidfVectorizer(), 'project_essay_3'),
        ('tfidf6', TfidfVectorizer(), 'project_essay_4')
    ],
    remainder='passthrough'
)

"""
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, ['total_price']),
        ('tfidf1', TfidfVectorizer(), 'combined_essay'),
        ('tfidf3', TfidfVectorizer(), 'project_title')

    ],
    remainder='passthrough'
)
"""

"\npreprocessor = ColumnTransformer(\n    transformers=[\n        ('num', scaler, ['total_price']),\n        ('tfidf1', TfidfVectorizer(), 'combined_essay'),\n        ('tfidf3', TfidfVectorizer(), 'project_title')\n        \n    ],\n    remainder='passthrough'\n)\n"

In [81]:
X_train_transformed = preprocessor.fit_transform(train_data)

#if isinstance(X_transformed, scipy.sparse.csr.csr_matrix):
   # X_transformed = X_transformed.toarray()
print('transform done')

transform done


In [82]:
print(X_train_transformed[:1])

  (0, 0)	-0.08984157131114846
  (0, 28)	0.07228229642585852
  (0, 428)	0.06866691310482467
  (0, 536)	0.06715020078442388
  (0, 1050)	0.04220629204986503
  (0, 1338)	0.0673918834731455
  (0, 1345)	0.06002472557318807
  (0, 1381)	0.08607564964692796
  (0, 1596)	0.03835149600572773
  (0, 1666)	0.06266913458343912
  (0, 1691)	0.10011575833526312
  (0, 1874)	0.07586013898269957
  (0, 2092)	0.11512651670173807
  (0, 2125)	0.07876045892967544
  (0, 2652)	0.04912409690884617
  (0, 2724)	0.049393125185424325
  (0, 3485)	0.16412509316547916
  (0, 3714)	0.0391270900083389
  (0, 3851)	0.060671586201399175
  (0, 4080)	0.08720442404778954
  (0, 4183)	0.10376678607523124
  (0, 4998)	0.06430795860661478
  (0, 5170)	0.060037461571631374
  (0, 5207)	0.10615667332192405
  (0, 5505)	0.08945797504427065
  :	:
  (0, 59817)	0.061449174481923566
  (0, 59934)	0.2933387835240739
  (0, 60502)	0.20036564661160763
  (0, 61079)	0.4485868883026018
  (0, 63199)	0.5421175559341035
  (0, 65128)	0.5740704022369629
  (0

In [83]:
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Assuming you have X and y already defined
X_train, X_test, y_train, y_test = train_test_split(X_train_transformed, y, test_size=30, random_state=40)

unique, counts = np.unique(y_train, return_counts=True)
print("Original class distribution:", dict(zip(unique, counts)))





Original class distribution: {0: 13176, 1: 74192}


In [84]:

# Applying SMOTE
smote = SMOTE(random_state=42)
#X_resampled_smote, y_resampled_smote = smote.fit_resample(X_train, y_train)
#unique_smote, counts_smote = np.unique(y_resampled_smote, return_counts=True)
#print("SMOTE class distribution:", dict(zip(unique_smote, counts_smote)))

In [85]:
import numpy as np
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification

# Applying Random Under-Sampling
rus = RandomUnderSampler(random_state=42)
X_resampled_rus, y_resampled_rus = rus.fit_resample(X_train, y_train)
unique_rus, counts_rus = np.unique(y_resampled_rus, return_counts=True)
print("Random Under-Sampling class distribution:", dict(zip(unique_rus, counts_rus)))

Random Under-Sampling class distribution: {0: 13176, 1: 13176}


In [86]:
import xgboost as xgb

xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

print('XG Start')

xgb_clf.fit(X_resampled_rus, y_resampled_rus)
y_pred_xgb = xgb_clf.predict(X_test)
print('XG Done')

accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Accuracy: {accuracy_xgb:.2f}")

# Optionally print a detailed classification report
print(classification_report(y_test, y_pred_xgb))

XG Start
XG Done
XGBoost Accuracy: 0.70
              precision    recall  f1-score   support

           0       0.30      0.60      0.40         5
           1       0.90      0.72      0.80        25

    accuracy                           0.70        30
   macro avg       0.60      0.66      0.60        30
weighted avg       0.80      0.70      0.73        30



In [87]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=42, max_iter=1000)
print('LR Start')

lr.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = lr.predict(X_test)
print('LR Done')


accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy
print(f"Accuracy: {accuracy:.2f}")

LR Start
LR Done
Accuracy: 0.80


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [88]:
import xgboost as xgb

xgb_clf_final = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

print('XG Start')

xgb_clf_final.fit(X_train, y_train)
y_pred_xgb = xgb_clf_final.predict(X_test)
print('XG Done')

accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Accuracy: {accuracy_xgb:.2f}")

# Optionally print a detailed classification report
print(classification_report(y_test, y_pred_xgb))

XG Start
XG Done
XGBoost Accuracy: 0.87
              precision    recall  f1-score   support

           0       1.00      0.20      0.33         5
           1       0.86      1.00      0.93        25

    accuracy                           0.87        30
   macro avg       0.93      0.60      0.63        30
weighted avg       0.89      0.87      0.83        30



In [None]:
"""
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Define the model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Define the parameter grid
param_grid = {
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2]
}

# Setup the grid search
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='accuracy', n_jobs=1, cv=3, verbose=1)

print('XG Fit')

# Fit grid search
grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)


best_model = grid_search.best_estimator_
y_pred_xgb = best_model.predict(X_test)
print('XG Done')

accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Accuracy: {accuracy_xgb:.2f}")
"""

'\nfrom sklearn.model_selection import GridSearchCV\nfrom xgboost import XGBClassifier\n\n# Define the model\nxgb_model = XGBClassifier(use_label_encoder=False, eval_metric=\'mlogloss\')\n\n# Define the parameter grid\nparam_grid = {\n    \'max_depth\': [3, 5, 7],\n    \'min_child_weight\': [1, 3, 5],\n    \'subsample\': [0.6, 0.8, 1.0],\n    \'colsample_bytree\': [0.6, 0.8, 1.0],\n    \'n_estimators\': [100, 200],\n    \'learning_rate\': [0.01, 0.1, 0.2]\n}\n\n# Setup the grid search\ngrid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring=\'accuracy\', n_jobs=1, cv=3, verbose=1)\n\nprint(\'XG Fit\')\n\n# Fit grid search\ngrid_search.fit(X_train, y_train)\n\n# Best parameters and best score\nprint("Best parameters:", grid_search.best_params_)\nprint("Best score:", grid_search.best_score_)\n\n\nbest_model = grid_search.best_estimator_\ny_pred_xgb = best_model.predict(X_test)\nprint(\'XG Done\')\n\naccuracy_xgb = accuracy_score(y_test, y_pred_xgb)\nprint(f"XGBoost

In [None]:
"""
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


rfc = RandomForestClassifier(criterion='entropy',random_state=42)

#('Lr',LogisticRegression(max_iter=1000))

est = [('KNN',KNN(3)),('DT',DecisionTreeClassifier(max_depth=5)),('sv',SVC(kernel='linear')), ('rfc',RandomForestClassifier(criterion='entropy',random_state=42))]
hard_vot = VotingClassifier(estimators=est,voting='hard')
hard_vot.fit(X_train, y_train)
y_pred_h = hard_vot.predict(X_test)
print('hard vote accuracy')
accuracy_score(y_pred_h,y_test)


est_soft = [('KNN',KNN(3)),('DT',DecisionTreeClassifier(max_depth=5)),('sv',SVC(kernel='linear',probability=True)), ('rfc',RandomForestClassifier(n_estimators=100,criterion='entropy',random_state=42))]

soft_vot = VotingClassifier(estimators=est_soft,voting='soft')
soft_vot.fit(X_train, y_train)
y_pred_h = soft_vot.predict(X_test)
print('soft vote accuracy')
accuracy_score(y_pred_h,y_test)
"""

NameError: name 'X_train' is not defined

In [89]:
test_data = pd.read_csv('/kaggle/input/testdata/test.csv')

In [90]:
test_data.shape

(21850, 16)

In [91]:
test_data.head(5)

Unnamed: 0,teacher_id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_essay_1,project_essay_2,project_essay_3,project_essay_4,project_resource_summary,teacher_number_of_previously_posted_projects,total_quantity,total_price
0,771652318ce616e8bfd8417395c38d8c,Mrs.,CT,2017-03-04 14:53:47,Grades PreK-2,"Literacy & Language, Special Needs","Literacy, Special Needs",Hot Off the Press: School to Home Connection f...,A picture is worth a thousand words is more th...,Pictures are so important for my students. We ...,,,"My students need a a mobile color printer, ink...",202,10,409.83
1,2269be528cdd948ba1d016e1430c005d,Mrs.,MT,2016-05-19 15:10:41,Grades 9-12,Music & The Arts,Visual Arts,We can see clearly now!,My students live in a rural agricultural town ...,"One way students learn how to draw, paint, or ...",,,My students need an interactive document camer...,0,1,649.99
2,815744a4821c7b9647d396386193d88a,Mrs.,ME,2016-04-27 11:25:48,Grades PreK-2,Literacy & Language,Literature & Writing,"Dear Mrs. Perry, We Want Bean Bag Chairs!",I have a classroom full of very convincing wri...,Our classroom is made up of 15 hardworking and...,We are asking for four extra large bean bag ch...,These bean bags will give my students more tha...,My students need to know how powerful writing ...,4,4,169.86
3,253f13fa0a9160b34886a82de1849e0c,Mrs.,TX,2016-10-12 07:04:19,Grades PreK-2,Literacy & Language,Literacy,Back to Basics?,"My school is a Title 1 school, and it is a low...",We live in a technology based world where comp...,,,My students need good technology in our classr...,7,2,181.75
4,603cad453c76050359ae6e9247a5b4ca,Mrs.,CO,2017-03-09 11:24:57,Grades 6-8,Literacy & Language,Literacy,Exciting Books for Struggling Readers at the M...,"Your hands are sweaty, and your heart is beati...",By the time that struggling readers make it to...,,,My students need exciting and engaging books a...,0,21,22.62


In [92]:
test_data.isna().sum()

teacher_id                                          0
teacher_prefix                                      0
school_state                                        0
project_submitted_datetime                          0
project_grade_category                              0
project_subject_categories                          0
project_subject_subcategories                       0
project_title                                       0
project_essay_1                                     0
project_essay_2                                     0
project_essay_3                                 21165
project_essay_4                                 21165
project_resource_summary                            0
teacher_number_of_previously_posted_projects        0
total_quantity                                      0
total_price                                         0
dtype: int64

In [93]:
test_data.drop(columns=['teacher_id','teacher_prefix','school_state','project_submitted_datetime'], inplace=True)

In [94]:
test_data.isna().sum()

project_grade_category                              0
project_subject_categories                          0
project_subject_subcategories                       0
project_title                                       0
project_essay_1                                     0
project_essay_2                                     0
project_essay_3                                 21165
project_essay_4                                 21165
project_resource_summary                            0
teacher_number_of_previously_posted_projects        0
total_quantity                                      0
total_price                                         0
dtype: int64

In [95]:
test_data['project_essay_3'] = test_data['project_essay_3'].fillna('')
test_data['project_essay_4'] = test_data['project_essay_4'].fillna('')

In [96]:
test_data.isna().sum()

project_grade_category                          0
project_subject_categories                      0
project_subject_subcategories                   0
project_title                                   0
project_essay_1                                 0
project_essay_2                                 0
project_essay_3                                 0
project_essay_4                                 0
project_resource_summary                        0
teacher_number_of_previously_posted_projects    0
total_quantity                                  0
total_price                                     0
dtype: int64

In [97]:
test_data['project_grade_category'] = LE.fit_transform(test_data['project_grade_category'])
test_data['project_subject_categories'] = LE.fit_transform(test_data['project_subject_categories'])
test_data['project_subject_subcategories'] = LE.fit_transform(test_data['project_subject_subcategories'])

print("Unique values for project_grade_category : ",test_data.project_grade_category.unique())
print("Unique values for project_subject_categories : ",test_data.project_subject_categories.unique())
print("Unique values for project_subject_subcategories : ",test_data.project_subject_subcategories.unique())
print("Unique values for project_subject_subcategories : ",test_data.project_subject_subcategories.unique())
print("Unique values for teacher_number_of_previously_posted_projects : ",test_data.teacher_number_of_previously_posted_projects.unique())
print("Unique values for teacher_number_of_previously_posted_projects : ",test_data.teacher_number_of_previously_posted_projects.unique())
print("Unique values for total_quantity : ",test_data.total_quantity.unique())
print("Unique values for total_price : ",test_data.total_price.unique())

Unique values for project_grade_category :  [3 2 1 0]
Unique values for project_subject_categories :  [29 39 23 27  8 35  0 31 44 16 19  6 48 37 28 26 32  4  1  3 36 34 14  5
 11 24 17 33 46 43 13  2 47 12 42 22 10 21  9 20 25 45 18 41 40 15  7 38
 30]
Unique values for project_subject_subcategories :  [276 334 279 267 269 214 268  15 230 280 178 290 329  17 203 263  61 315
 335 164 255 297 274 257 131 227 275 215 147 169   3 120   0 286  84 134
 224 301  38  25 325 166 251 304 187 122 159 245 278 244 256 145  30 242
  12  36 282 110 175 248  64  26 136 163 313 231 241   8 206 239 322  83
 299  39 238   6  70  90 291 333 260 167  46  16 140 288 331 138 308 232
  19 129 177 103 305 228 219  86 139 186 207 121 112  37  95 102  82   4
   9 247 332 246  23 285  91  21 323 168  48 294 270  22 264 243 272 266
 150 119 133 249 200 309  94  79  87 252  45  50 199 327   2 174 143 130
 223 148 326 273  66  27 320 281 196 152 283  32  78  73 307 189 165  40
 105 284  60 115 128 220  35  47 132 12

In [98]:
test_data.head(10)

Unnamed: 0,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_essay_1,project_essay_2,project_essay_3,project_essay_4,project_resource_summary,teacher_number_of_previously_posted_projects,total_quantity,total_price
0,3,29,276,Hot Off the Press: School to Home Connection f...,A picture is worth a thousand words is more th...,Pictures are so important for my students. We ...,,,"My students need a a mobile color printer, ink...",202,10,409.83
1,2,39,334,We can see clearly now!,My students live in a rural agricultural town ...,"One way students learn how to draw, paint, or ...",,,My students need an interactive document camer...,0,1,649.99
2,3,23,279,"Dear Mrs. Perry, We Want Bean Bag Chairs!",I have a classroom full of very convincing wri...,Our classroom is made up of 15 hardworking and...,We are asking for four extra large bean bag ch...,These bean bags will give my students more tha...,My students need to know how powerful writing ...,4,4,169.86
3,3,23,267,Back to Basics?,"My school is a Title 1 school, and it is a low...",We live in a technology based world where comp...,,,My students need good technology in our classr...,7,2,181.75
4,1,23,267,Exciting Books for Struggling Readers at the M...,"Your hands are sweaty, and your heart is beati...",By the time that struggling readers make it to...,,,My students need exciting and engaging books a...,0,21,22.62
5,0,27,269,When being in 5th is better than 1st.,Our students come from all places and differen...,These Chromebooks will allow my students to wo...,,,My students need more technology resources to ...,11,4,119.99
6,3,8,214,Please help us store amazing playground equipm...,My students are enthusiastic and engaged in al...,Earlier this year we were essentially able to ...,,,My students need a durable storage unit for al...,8,1,176.59
7,0,27,269,Can You SEE Me Now?!?!,"Every morning, my students walk into our class...",The LCD projector and accessories would make a...,,,"My students need an LCD projector, protective ...",4,3,777.98
8,0,23,268,Dictionaries Needed for Title 1 Students,"I am a 3rd, 4th, and 5th grade reading teacher...",We do not have proper dictionaries for our stu...,I would like to have 25-30 student dictionarie...,Your donation would assist those ESL students ...,My students need a complete class set of dicti...,0,3,47.5
9,0,35,15,iCan Explore My World With My iPad,My students attend an elementary school in a h...,My students need an iPad to help them explore ...,,,My students need an iPad Air 2 with a protecti...,33,2,553.16


In [99]:
print(0)

test_data['project_essay_1'] = test_data['project_essay_1'].astype(str).apply(stem_text)
test_data['project_essay_1'] = test_data['project_essay_1'].astype(str).apply(lemmatize_text)

print(1)

test_data['project_essay_2'] = test_data['project_essay_2'].astype(str).apply(stem_text)
test_data['project_essay_2'] = test_data['project_essay_2'].astype(str).apply(lemmatize_text)

print(2)

test_data['project_essay_3'] = test_data['project_essay_3'].astype(str).apply(stem_text)
test_data['project_essay_3'] = test_data['project_essay_3'].astype(str).apply(lemmatize_text)

print(3)

test_data['project_essay_4'] = test_data['project_essay_4'].astype(str).apply(stem_text)
test_data['project_essay_4'] = test_data['project_essay_4'].astype(str).apply(lemmatize_text)

print(4)


test_data['project_resource_summary'] = test_data['project_resource_summary'].astype(str).apply(stem_text)
test_data['project_resource_summary'] = test_data['project_resource_summary'].astype(str).apply(lemmatize_text)

print(5)

test_data['project_title'] = test_data['project_title'].astype(str).apply(stem_text)
test_data['project_title'] = test_data['project_title'].astype(str).apply(lemmatize_text)


print(6)

0
1
2
3
4
5
6


In [100]:
test_data.isna().sum()

project_grade_category                          0
project_subject_categories                      0
project_subject_subcategories                   0
project_title                                   0
project_essay_1                                 0
project_essay_2                                 0
project_essay_3                                 0
project_essay_4                                 0
project_resource_summary                        0
teacher_number_of_previously_posted_projects    0
total_quantity                                  0
total_price                                     0
dtype: int64

In [101]:
test_data.head()

Unnamed: 0,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_essay_1,project_essay_2,project_essay_3,project_essay_4,project_resource_summary,teacher_number_of_previously_posted_projects,total_quantity,total_price
0,3,29,276,hot off the press : school to home connect for...,a pictur is worth a thousand word is more than...,pictur are so import for my student . we use t...,,,"my student need a a mobil color printer , ink ...",202,10,409.83
1,2,39,334,we can see clearli now !,my student live in a rural agricultur town ( p...,"one way student learn how to draw , paint , or...",,,my student need an interact document camera to...,0,1,649.99
2,3,23,279,"dear mrs. perri , we want bean bag chair !",i have a classroom full of veri convinc writer...,our classroom is made up of 15 hardwork and in...,we are ask for four extra larg bean bag chair ...,these bean bag will give my student more than ...,my student need to know how power write can be...,4,4,169.86
3,3,23,267,back to basic ?,"my school is a titl 1 school , and it is a low...",we live in a technolog base world where comput...,,,my student need good technolog in our classroo...,7,2,181.75
4,1,23,267,excit book for struggl reader at the middl sch...,"your hand are sweati , and your heart is beat ...",by the time that struggl reader make it to mid...,,,my student need excit and engag book at their ...,0,21,22.62


In [102]:
train_data.head()

Unnamed: 0,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_essay_1,project_essay_2,project_essay_3,project_essay_4,project_resource_summary,teacher_number_of_previously_posted_projects,total_quantity,total_price
0,0,24,311,to infin and beyond ... let 's edit !,"\ `` can you be our teacher next year , you ma...",i would love for my student to have the opport...,,,my student need a document camera to assist in...,0,1,264.99
1,3,30,319,read with our ear ? !,live in poverti often mean lack of access to t...,can you imagin a child not have the simpl plea...,,,my student need a listen center with headphon ...,12,5,847.0
2,2,0,205,debat materi for upcom year,we are a high school speech and debat team . w...,my student are a mixtur of male and femal . we...,they will use the materi for basic function of...,"we have to beg , borrow and steal materi from ...",my student need basic suppli for the upcom com...,1,13,222.2
3,3,30,319,help letter sound come to life for my student !,my student are a fun bunch of diver kid who lo...,alphabet and sound tub will allow sound to com...,,,my student need letter sound tub to allow them...,0,5,318.97
4,0,24,310,use technolog to develop comprehens skill with...,my student are work veri hard to increas their...,i want to began the school year with a place w...,,,my student need a larg class rug to feel free ...,0,25,606.41


In [103]:
X_test_transformed = preprocessor.transform(test_data)

#if isinstance(X_transformed, scipy.sparse.csr.csr_matrix):
   # X_transformed = X_transformed.toarray()
print('test transform done')

test transform done


In [104]:
print(X_test_transformed[:1])

  (0, 0)	0.30435561968989194
  (0, 482)	0.242931854772195
  (0, 1362)	0.07491525123739078
  (0, 1511)	0.03555000552675944
  (0, 1596)	0.05005694687599927
  (0, 1666)	0.040898346444419684
  (0, 1691)	0.16800761179177137
  (0, 1977)	0.08199784643035013
  (0, 2276)	0.3111568429587744
  (0, 2819)	0.08213386232015021
  (0, 3388)	0.0646488021475074
  (0, 4210)	0.047163272194178096
  (0, 4227)	0.08254012325851016
  (0, 4610)	0.04238885942798879
  (0, 4953)	0.2856129557324067
  (0, 4961)	0.10929013084051425
  (0, 5711)	0.07558894262895693
  (0, 5800)	0.06326698288190856
  (0, 6355)	0.05059874291891627
  (0, 6428)	0.10440212478976969
  (0, 6622)	0.11895093696752046
  (0, 8222)	0.04862081235744843
  (0, 8734)	0.027795705103146134
  (0, 8880)	0.04257365946954464
  (0, 8953)	0.062080958196327474
  :	:
  (0, 64795)	0.33102501946566776
  (0, 64850)	0.4049328043347854
  (0, 67586)	0.3576014268334035
  (0, 68382)	0.4656103921820651
  (0, 69267)	0.26850950762180503
  (0, 70614)	0.1594611696576087
  (0,

In [105]:
print('XG Test')
y_pred_test = xgb_clf_final.predict(X_test_transformed)
print('XG Done')

XG Test
XG Done


In [108]:
print(y_pred_test)

[1 1 1 ... 1 1 1]


In [112]:
predictions_df = pd.DataFrame({'predicted_label': y_pred_test})
# Save the predictions to a CSV file without indexes
predictions_df.to_csv('predict_se22uari098.csv', index=False)

In [113]:
predictions_df.value_counts()

predicted_label
1                  21051
0                    799
Name: count, dtype: int64