In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# 1) Load the labelled training pages and the unlabelled test pages.
train_df = pd.read_csv("Train_data (1).csv")
test_df = pd.read_csv("Test_data.csv")

print("Train Columns:", train_df.columns)
print("Test Columns:", test_df.columns)

# 2) Features and target: 'data' holds the page text, 'tagging' the class label.
# Missing text is replaced with an empty string because TfidfVectorizer raises
# ValueError on NaN documents; astype(str) guards against non-string cells.
X = train_df['data'].fillna("").astype(str)
y = train_df['tagging']

# 3) Hold out 20% of the rows for validation. stratify=y keeps the class
# proportions the same in both parts; random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 4) TF-IDF over unigrams and bigrams, capped at 5000 features. The vocabulary
# is fitted on the training split only and then reused for validation/test.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

# 5) Fit the classifier; max_iter is raised to 1000 iterations.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

# 6) Score the held-out validation split.
y_val_pred = model.predict(X_val_vec)
print("\n‚úÖ Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report:\n", classification_report(y_val, y_val_pred))

# 7) Predict the test pages, applying the same missing-text handling as for
# the training text so a blank page cannot abort the run.
X_test_vec = vectorizer.transform(test_df['data'].fillna("").astype(str))
test_pred = model.predict(X_test_vec)

# 8) Write the submission file: the test set's row_id next to the predicted class.
submission = pd.DataFrame({
    'row_id': test_df['row_id'],
    'predicted_class': test_pred
})
submission.to_csv("submission.csv", index=False)
print("\n‚úÖ submission.csv created successfully!")


Train Columns: Index(['fname', 'page_number', 'data', 'tagging'], dtype='object')
Test Columns: Index(['row_id', 'fname', 'page_number', 'data'], dtype='object')

‚úÖ Validation Accuracy: 0.9401709401709402

Classification Report:
               precision    recall  f1-score   support

          bs       1.00      0.62      0.76        13
    negative       0.95      0.99      0.97      1280
    notes_bs       0.86      0.76      0.81       169
 notes_pandl       0.78      0.40      0.53        45
       pandl       0.92      0.86      0.89        14

    accuracy                           0.94      1521
   macro avg       0.90      0.72      0.79      1521
weighted avg       0.94      0.94      0.94      1521


‚úÖ submission.csv created successfully!


In [2]:
# Any sample texts can be placed here to try out the fitted model.
sample_text = [
    "This page contains the profit and loss summary for the company.",
    "The balance sheet includes assets, liabilities and equity details.",
]

# Convert the texts with the already-fitted TF-IDF vectorizer, then classify them.
sample_vec = vectorizer.transform(sample_text)
sample_pred = model.predict(sample_vec)

# Print the first 50 characters of each text next to its predicted class.
for position, label in enumerate(sample_pred):
    snippet = sample_text[position][:50]
    print(f"\nüìÑ Text: {snippet}...")
    print(f"üëâ Predicted Class: {label}")



üìÑ Text: This page contains the profit and loss summary for...
üëâ Predicted Class: negative

üìÑ Text: The balance sheet includes assets, liabilities and...
üëâ Predicted Class: negative


In [16]:
# Disabled debugging cell: when uncommented, it prints the kernel's current
# working directory (useful for checking where relative CSV paths resolve).
# NOTE(review): the output saved under this cell presumably comes from a run
# made before the code was commented out -- confirm, or clear the output.
# import os
# print(os.getcwd())


C:\Users\Lenovo\omkar_project


In [17]:
# Disabled exploration cell: every statement below is commented out. When
# uncommented, it reloads both CSV files and prints their shapes, their column
# lists, and the first two rows of each frame.
# NOTE(review): the output saved under this cell presumably comes from a run
# made before the code was commented out -- confirm, or clear the output.
# import pandas as pd

# # 1) Load Train & Test
# train_df = pd.read_csv("Train_data (1).csv")
# test_df = pd.read_csv("Test_data.csv")





# print("Train shape:", train_df.shape)
# print("Test shape:", test_df.shape)

# print("\nTrain columns:", train_df.columns.tolist())
# print("Test columns:", test_df.columns.tolist())

# # Peek at the data
# print("\nSample Train Data:")
# print(train_df.head(2))

# print("\nSample Test Data:")
# print(test_df.head(2))


Train shape: (7602, 4)
Test shape: (3259, 4)

Train columns: ['fname', 'page_number', 'data', 'tagging']
Test columns: ['row_id', 'fname', 'page_number', 'data']

Sample Train Data:
                                               fname  page_number  \
0                      ITCAnnualreport-accounts-2015           62   
1  ShriRam City UnionAnnualSCUF_Annual_Reports_20...          181   

                                                data   tagging  
0  Report Management For Your ended SOCIO-ECONOMI...  negative  
1  Consolidated NOTES forming A. B. 36. S.noParti...  negative  

Sample Test Data:
   row_id                                    fname  page_number  \
0       1  Jet AirwaysAnnualJet_Airways_AR_2017-18          143   
1       2         icici-bank-annual-report-2015-16          181   

                                                data  
0  Notes iii. * Market Market in related Jet The ...  
1  Schedules Financial forming 40. 40A. 41. 42. D...  


In [3]:
# train_and_save_model.py

import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Load the labelled pages: 'data' is the page text, 'tagging' the class label.
train_df = pd.read_csv("Train_data (1).csv")
# Missing text is replaced with an empty string because TfidfVectorizer raises
# ValueError on NaN documents; astype(str) guards against non-string cells.
X = train_df["data"].fillna("").astype(str)
y = train_df["tagging"]

# Stratified 80/20 split with a fixed seed. Only the 80% training part is used
# below; X_val / y_val are produced but not evaluated in this script.
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Fit the TF-IDF vocabulary (unigrams + bigrams, capped at 5000 features) on
# the training split.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_vec = vectorizer.fit_transform(X_train)

model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

# Persist both artifacts: the classifier can only score text that has been
# transformed by this exact fitted vectorizer, so the two files belong together.
joblib.dump(model, "model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")

print("‚úÖ Model & vectorizer saved!")


‚úÖ Model & vectorizer saved!


In [2]:
# app.py

import streamlit as st
import joblib

# Page configuration comes first: Streamlit documents set_page_config as a call
# that should precede other Streamlit commands, and the cached loader below may
# render a spinner while it runs.
st.set_page_config(page_title="Text Classifier", layout="centered")


@st.cache_resource
def load_artifacts():
    """Load the fitted classifier and TF-IDF vectorizer, cached across reruns.

    Streamlit re-executes this script from top to bottom on every widget
    interaction; caching keeps the two pickles from being re-read each time
    the button is clicked.

    Returns:
        tuple: ``(model, vectorizer)`` as stored in ``model.pkl`` and
        ``vectorizer.pkl``.

    Raises:
        FileNotFoundError: if either artifact file is missing from the
            working directory.
    """
    # NOTE(security): joblib.load unpickles arbitrary Python objects. Only load
    # artifact files that were produced by a trusted training run.
    return joblib.load("model.pkl"), joblib.load("vectorizer.pkl")


model, vectorizer = load_artifacts()

# UI: title plus the fixed example texts that will be classified.
st.title("üìÑ Sample Text Prediction")

sample_texts = [
    "This page contains the profit and loss summary for the company.",
    "The balance sheet includes assets, liabilities and equity details."
]

st.subheader("üìù Sample Texts")
for i, txt in enumerate(sample_texts, 1):
    st.markdown(f"**Sample {i}:** {txt}")

# Predictions are computed only after the button is pressed.
if st.button("üîÆ Predict Classes"):
    sample_vec = vectorizer.transform(sample_texts)
    sample_pred = model.predict(sample_vec)

    st.subheader("‚úÖ Predictions")
    for text, pred in zip(sample_texts, sample_pred):
        # Show the first 60 characters of the text followed by its label.
        st.markdown(f"**üìÑ Text:** {text[:60]}...")
        st.markdown(f"üëâ **Predicted Class:** `{pred}`")
        st.markdown("---")


