In [2]:
#k

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV


In [4]:
df = pd.read_excel('/Users/asus/Desktop/term6/ML/train_data.xlsx', header=None, names=['text', 'label'])


In [5]:
import re

def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  
    text = text.strip()  
    text = text.lower()  
    return text

df['text'] = df['text'].apply(clean_text)


In [6]:
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['text'])
y = df['label']


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)


In [9]:
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.5116751269035533
Classification Report:
               precision    recall  f1-score   support

       ANGRY       0.40      0.43      0.41       187
        FEAR       0.45      0.40      0.42        65
       HAPPY       0.75      0.66      0.70       325
       OTHER       0.43      0.49      0.45       242
         SAD       0.40      0.39      0.39       166

    accuracy                           0.51       985
   macro avg       0.48      0.47      0.48       985
weighted avg       0.53      0.51      0.52       985



In [10]:
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10]
}

grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
best_clf = grid_search.best_estimator_


Best parameters found:  {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 20}


In [11]:
y_pred_best = best_clf.predict(X_test)
print("Optimized Accuracy:", accuracy_score(y_test, y_pred_best))
print("Optimized Classification Report:\n", classification_report(y_test, y_pred_best))


Optimized Accuracy: 0.5116751269035533
Optimized Classification Report:
               precision    recall  f1-score   support

       ANGRY       0.38      0.45      0.41       187
        FEAR       0.49      0.40      0.44        65
       HAPPY       0.80      0.65      0.72       325
       OTHER       0.42      0.51      0.46       242
         SAD       0.40      0.37      0.38       166

    accuracy                           0.51       985
   macro avg       0.50      0.47      0.48       985
weighted avg       0.54      0.51      0.52       985



In [12]:
pruned_clf = DecisionTreeClassifier(max_depth=best_clf.max_depth, 
                                    min_samples_split=best_clf.min_samples_split, 
                                    min_samples_leaf=best_clf.min_samples_leaf,
                                    ccp_alpha=0.01)  
pruned_clf.fit(X_train, y_train)

y_pred_pruned = pruned_clf.predict(X_test)
print("Pruned Model Accuracy:", accuracy_score(y_test, y_pred_pruned))
print("Pruned Model Classification Report:\n", classification_report(y_test, y_pred_pruned))


Pruned Model Accuracy: 0.4050761421319797
Pruned Model Classification Report:
               precision    recall  f1-score   support

       ANGRY       0.93      0.13      0.23       187
        FEAR       1.00      0.22      0.35        65
       HAPPY       0.94      0.37      0.53       325
       OTHER       0.29      1.00      0.46       242
         SAD       0.00      0.00      0.00       166

    accuracy                           0.41       985
   macro avg       0.63      0.34      0.31       985
weighted avg       0.62      0.41      0.35       985



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

nb_clf = MultinomialNB()
nb_clf.fit(X_train, y_train)
y_pred_nb = nb_clf.predict(X_test)
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Naive Bayes Classification Report:\n", classification_report(y_test, y_pred_nb))


Naive Bayes Accuracy: 0.5868020304568528
Naive Bayes Classification Report:
               precision    recall  f1-score   support

       ANGRY       0.57      0.38      0.46       187
        FEAR       0.00      0.00      0.00        65
       HAPPY       0.70      0.83      0.76       325
       OTHER       0.46      0.70      0.55       242
         SAD       0.67      0.42      0.51       166

    accuracy                           0.59       985
   macro avg       0.48      0.46      0.46       985
weighted avg       0.56      0.59      0.56       985



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
svm_clf = SVC(kernel='linear')
svm_clf.fit(X_train, y_train)
y_pred_svm = svm_clf.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("SVM Classification Report:\n", classification_report(y_test, y_pred_svm))


SVM Accuracy: 0.617258883248731
SVM Classification Report:
               precision    recall  f1-score   support

       ANGRY       0.54      0.57      0.55       187
        FEAR       0.71      0.38      0.50        65
       HAPPY       0.87      0.74      0.80       325
       OTHER       0.46      0.65      0.54       242
         SAD       0.58      0.48      0.52       166

    accuracy                           0.62       985
   macro avg       0.63      0.56      0.58       985
weighted avg       0.65      0.62      0.62       985



In [15]:
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest Classification Report:\n", classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.5989847715736041
Random Forest Classification Report:
               precision    recall  f1-score   support

       ANGRY       0.60      0.40      0.48       187
        FEAR       0.72      0.35      0.47        65
       HAPPY       0.85      0.72      0.78       325
       OTHER       0.43      0.80      0.56       242
         SAD       0.59      0.38      0.46       166

    accuracy                           0.60       985
   macro avg       0.64      0.53      0.55       985
weighted avg       0.65      0.60      0.60       985



In [16]:
param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10]
}

grid_search_dt = GridSearchCV(DecisionTreeClassifier(), param_grid_dt, cv=5, scoring='accuracy')
grid_search_dt.fit(X_train, y_train)

best_dt_clf = grid_search_dt.best_estimator_
y_pred_best_dt = best_dt_clf.predict(X_test)
print("Optimized Decision Tree Accuracy:", accuracy_score(y_test, y_pred_best_dt))
print("Optimized Decision Tree Classification Report:\n", classification_report(y_test, y_pred_best_dt))


Optimized Decision Tree Accuracy: 0.5035532994923858
Optimized Decision Tree Classification Report:
               precision    recall  f1-score   support

       ANGRY       0.36      0.42      0.39       187
        FEAR       0.51      0.42      0.46        65
       HAPPY       0.80      0.65      0.72       325
       OTHER       0.40      0.48      0.44       242
         SAD       0.39      0.37      0.38       166

    accuracy                           0.50       985
   macro avg       0.49      0.47      0.48       985
weighted avg       0.53      0.50      0.51       985



In [17]:
param_grid_svm = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf']
}

grid_search_svm = GridSearchCV(SVC(), param_grid_svm, cv=5, scoring='accuracy')
grid_search_svm.fit(X_train, y_train)

best_svm_clf = grid_search_svm.best_estimator_
y_pred_best_svm = best_svm_clf.predict(X_test)
print("Optimized SVM Accuracy:", accuracy_score(y_test, y_pred_best_svm))
print("Optimized SVM Classification Report:\n", classification_report(y_test, y_pred_best_svm))




Optimized SVM Accuracy: 0.6294416243654822
Optimized SVM Classification Report:
               precision    recall  f1-score   support

       ANGRY       0.59      0.56      0.57       187
        FEAR       0.77      0.31      0.44        65
       HAPPY       0.88      0.74      0.80       325
       OTHER       0.48      0.72      0.57       242
         SAD       0.57      0.49      0.53       166

    accuracy                           0.63       985
   macro avg       0.66      0.56      0.58       985
weighted avg       0.67      0.63      0.63       985



In [18]:
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10]
}

grid_search_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train, y_train)

best_rf_clf = grid_search_rf.best_estimator_
y_pred_best_rf = best_rf_clf.predict(X_test)
print("Optimized Random Forest Accuracy:", accuracy_score(y_test, y_pred_best_rf))
print("Optimized Random Forest Classification Report:\n", classification_report(y_test, y_pred_best_rf))


Optimized Random Forest Accuracy: 0.583756345177665
Optimized Random Forest Classification Report:
               precision    recall  f1-score   support

       ANGRY       0.56      0.34      0.43       187
        FEAR       0.73      0.34      0.46        65
       HAPPY       0.82      0.73      0.77       325
       OTHER       0.44      0.79      0.56       242
         SAD       0.52      0.37      0.44       166

    accuracy                           0.58       985
   macro avg       0.61      0.51      0.53       985
weighted avg       0.62      0.58      0.58       985



In [19]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)


cv_results_dt = cross_val_score(best_dt_clf, X, y, cv=kfold, scoring='accuracy')
print("Cross-Validated Decision Tree Accuracy: %.2f%%" % (cv_results_dt.mean()*100.0))


cv_results_svm = cross_val_score(best_svm_clf, X, y, cv=kfold, scoring='accuracy')
print("Cross-Validated SVM Accuracy: %.2f%%" % (cv_results_svm.mean()*100.0))


cv_results_rf = cross_val_score(best_rf_clf, X, y, cv=kfold, scoring='accuracy')
print("Cross-Validated Random Forest Accuracy: %.2f%%" % (cv_results_rf.mean()*100.0))


Cross-Validated Decision Tree Accuracy: 50.79%
Cross-Validated SVM Accuracy: 61.33%
Cross-Validated Random Forest Accuracy: 58.29%


In [1]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
import re
import nltk


nltk.download('stopwords')


persian_stopwords = list(["و", "در", "به", "از", "که", "این", "را", "با", "است", "برای", "آن", "تا", "می", "شود", "وی", "بر", "بود", "ها", "ای", "کرد", "نیز", "هم", "اگر", "ما", "یا", "هر", "همه", "او", "یک", "چه", "کنند", "باید", "نه"])


def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  
    text = text.strip()  
    text = text.lower()  
    return text


df_train = pd.read_excel('/Users/asus/Desktop/term6/ML/train_data.xlsx', header=None, names=['text', 'label'])
df_test = pd.read_csv('/Users/asus/Desktop/term6/ML/3rdHW_test.csv', header=None, names=['text'])


df_train['text'] = df_train['text'].apply(clean_text)
df_test['text'] = df_test['text'].apply(clean_text)


vectorizer = TfidfVectorizer(stop_words=persian_stopwords, ngram_range=(1, 2), max_features=5000)
X = vectorizer.fit_transform(df_train['text'])
X_test = vectorizer.transform(df_test['text'])
y = df_train['label']


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


pipeline = Pipeline([
    ('classifier', LogisticRegression(solver='liblinear'))
])


param_grid = {
    'classifier__C': [0.1, 1, 10],
}


grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
print("Best parameters found: ", grid_search.best_params_)


best_model = grid_search.best_estimator_


y_val_pred = best_model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("Validation Classification Report:\n", classification_report(y_val, y_val_pred))


test_predictions = best_model.predict(X_test)
df_test['predictions'] = test_predictions


df_test[['text', 'predictions']].to_csv('/Users/asus/Desktop/term6/ML/test_predictions2.csv', index=False)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Best parameters found:  {'classifier__C': 10}
Validation Accuracy: 0.6142131979695431
Validation Classification Report:
               precision    recall  f1-score   support

       ANGRY       0.55      0.56      0.55       187
        FEAR       0.68      0.42      0.51        65
       HAPPY       0.81      0.76      0.78       325
       OTHER       0.49      0.61      0.54       242
         SAD       0.53      0.49      0.51       166

    accuracy                           0.61       985
   macro avg       0.61      0.56      0.58       985
weighted avg       0.63      0.61      0.62       985



### Explanation and Interpretation of Each Section of the Code

1. **Importing Libraries:**

    ```python
    import pandas as pd
    import numpy as np
    from sklearn.model_selection import train_test_split, GridSearchCV
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import classification_report, accuracy_score
    from sklearn.pipeline import Pipeline
    import re
    import nltk
    ```

    In this section, the necessary libraries for data analysis and model building are imported. `pandas` and `numpy` are used for data manipulation, `sklearn` for modeling and evaluation, and `re` for text cleaning. Additionally, `nltk` is used for handling stopwords.

2. **Downloading Stopwords:**

    ```python
    
    nltk.download('stopwords')
    ```

    Here, the default stopwords from `nltk` are downloaded to be used in cleaning the data.

3. **Defining Persian Stopwords:**

    ```python
    
    persian_stopwords = list(["و", "در", "به", "از", "که", "این", "را", "با", "است", "برای", "آن", "تا", "می", "شود", "وی", "بر", "بود", "ها", "ای", "کرد", "نیز", "هم", "اگر", "ما", "یا", "هر", "همه", "او", "یک", "چه", "کنند", "باید", "نه"])
    ```

    In this section, a set of Persian stopwords is defined which are commonly found in Persian texts and need to be removed.

4. **Defining Text Cleaning Function:**

    ```python
    
    def clean_text(text):
        text = re.sub(r'[^\w\s]', '', text)  
        text = text.strip()  
        text = text.lower()  
        return text
    ```

    This function is responsible for cleaning the texts:
    - Removing punctuation
    - Removing extra whitespaces
    - Converting letters to lowercase

5. **Loading Data:**

    ```python
    
    df_train = pd.read_excel('/mnt/data/train_data.xlsx', header=None, names=['text', 'label'])
    df_test = pd.read_csv('/mnt/data/3rdHW_test.csv', header=None, names=['text'])
    ```

    Training and testing data are loaded from Excel and CSV files.

6. **Cleaning Data:**

    ```python
    
    df_train['text'] = df_train['text'].apply(clean_text)
    df_test['text'] = df_test['text'].apply(clean_text)
    ```

    In this section, the `clean_text` function is applied to the text columns of the data to clean the texts.

7. **Vectorizing Texts:**

    ```python
    
    vectorizer = TfidfVectorizer(stop_words=persian_stopwords, ngram_range=(1, 2), max_features=5000)
    X = vectorizer.fit_transform(df_train['text'])
    X_test = vectorizer.transform(df_test['text'])
    y = df_train['label']
    ```

    In this section, `TfidfVectorizer` is used to convert texts into TF-IDF vectors. These vectors represent the importance of words in the text. Persian stopwords and 1- to 2-grams are also used. `max_features` is limited to 5000 to reduce the number of features and execution time.

8. **Splitting Data into Training and Validation Sets:**

    ```python
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    ```

    Here, the training data is split into training and validation sets to train and then validate the model.

9. **Defining and Setting Up the Pipeline:**

    ```python
    
    pipeline = Pipeline([
        ('classifier', LogisticRegression(solver='liblinear'))
    ])
    ```

    In this section, a pipeline is defined which includes a Logistic Regression model. Using `Pipeline` allows us to manage the vectorization and model training process as a single unit.

10. **Defining the Parameter Grid (Grid Search):**

    ```python
    
    param_grid = {
        'classifier__C': [0.1, 1, 10],
    }
    ```

    Here, a grid of parameters for tuning the Logistic Regression model is defined. The `C` parameter, which controls the regularization strength, is tested with different values.

11. **Grid Search with Cross-Validation:**

    ```python
    
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print("Best parameters found: ", grid_search.best_params_)
    ```

    In this section, `GridSearchCV` is used to find the best parameters using cross-validation. The number of folds is reduced to 3 to decrease execution time.

12. **Best Model:**

    ```python
    
    best_model = grid_search.best_estimator_
    ```

    Here, the best model obtained from `GridSearchCV` is extracted.

13. **Evaluating Model Performance on Validation Set:**

    ```python
    
    y_val_pred = best_model.predict(X_val)
    print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
    print("Validation Classification Report:\n", classification_report(y_val, y_val_pred))
    ```

    In this section, the model is evaluated on the validation data, and the accuracy and classification report are printed.

14. **Predicting on Test Set:**

    ```python
    
    test_predictions = best_model.predict(X_test)
    df_test['predictions'] = test_predictions
    ```

    Here, the best model is applied to the test data without labels, and the prediction results are added to the test data file.

15. **Saving Predictions:**

    ```python
    
    df_test[['text', 'predictions']].to_csv('/mnt/data/test_predictions.csv', index=False)
    ```

    Finally, the model's predictions on the test data are saved in a CSV file.

### Why These Sections are Used?

- **Data Preprocessing**: To remove noise and improve data quality.
- **Text Vectorization**: To convert texts into a format that machine learning models can work with.
- **Data Splitting**: To evaluate the model and prevent overfitting.
- **Pipeline**: To integrate preprocessing and modeling steps.
- **Grid Search**: To find the best parameters and optimize model performance.
- **Simpler Models**: To reduce execution time and simplify the process.

