In [None]:
import pandas as pd
# Convert the test split from Parquet to CSV.
# index=False keeps the DataFrame's row index out of the file; otherwise a
# later pd.read_csv("test.csv") picks it up as an extra "Unnamed: 0" column.
df = pd.read_parquet('test.parquet')
df.to_csv('test.csv', index=False)

In [None]:
import pandas as pd
# Convert the training split from Parquet to CSV.
# index=False keeps the DataFrame's row index out of the file; otherwise a
# later pd.read_csv("train.csv") picks it up as an extra "Unnamed: 0" column.
df = pd.read_parquet('train.parquet')
df.to_csv('train.csv', index=False)

In [None]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Read the previously exported train/test CSV files from the working directory
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# English stop words from NLTK, kept as a set; clean_text filters tokens against it
stop_words = set(stopwords.words('english'))

# Text cleaning function
def clean_text(text):
  """Normalise one document before vectorisation.

  The text is lower-cased, every character that is neither alphanumeric nor
  whitespace is dropped, the remainder is tokenised with ``word_tokenize`` and
  tokens present in the module-level ``stop_words`` set are removed.

  Args:
    text: Raw document. Anything that is not a ``str`` (for example the NaN
      pandas uses for a missing cell) is treated as an empty document.

  Returns:
    The surviving tokens joined by single spaces ('' if nothing survives).
  """
  if not isinstance(text, str):
    return ''
  text = text.lower()
  # Keep every kind of whitespace, not only ' ': otherwise words separated by a
  # newline or tab get glued into one token once the separator is stripped.
  text = ''.join(char for char in text if char.isalnum() or char.isspace())
  tokens = word_tokenize(text)
  return ' '.join(w for w in tokens if w not in stop_words)

# Run the cleaning step over the text column of each split
for frame in (train_data, test_data):
  frame['text'] = frame['text'].apply(clean_text)

# Feature extraction: bag-of-words counts capped at 1000 features.
# For TF-IDF weighting instead, swap in TfidfVectorizer(max_features=1000).
vectorizer = CountVectorizer(max_features=1000)
# The vocabulary is learned from the training text and reused as-is for test
train_features = vectorizer.fit_transform(train_data['text'])
test_features = vectorizer.transform(test_data['text'])


In [None]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer  # Import Stemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB  # Example Classifier

# Read the previously exported train/test CSV files from the working directory
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# English stop words from NLTK, kept as a set; clean_text filters tokens against it
stop_words = set(stopwords.words('english'))

# Function for text cleaning and stemming
def clean_text(text):
  """Normalise and stem one document before vectorisation.

  The text is lower-cased, every character that is neither alphanumeric nor
  whitespace is dropped, the remainder is tokenised with ``word_tokenize``,
  tokens present in the module-level ``stop_words`` set are removed and the
  rest are stemmed with ``PorterStemmer``.

  Args:
    text: Raw document. Anything that is not a ``str`` (for example the NaN
      pandas uses for a missing cell) is treated as an empty document.

  Returns:
    The stemmed tokens joined by single spaces ('' if nothing survives).
  """
  if not isinstance(text, str):
    return ''
  text = text.lower()
  # Keep every kind of whitespace, not only ' ': otherwise words separated by a
  # newline or tab get glued into one token once the separator is stripped.
  text = ''.join(char for char in text if char.isalnum() or char.isspace())
  tokens = word_tokenize(text)
  stemmer = PorterStemmer()
  return ' '.join(stemmer.stem(w) for w in tokens if w not in stop_words)

# Clean text in both datasets
train_data['text'] = train_data['text'].apply(clean_text)
test_data['text'] = test_data['text'].apply(clean_text)

# Feature Engineering with TF-IDF: the vectorizer is fitted on the training
# text only and then reused to transform the test text.
vectorizer = TfidfVectorizer(max_features=1000)
train_features = vectorizer.fit_transform(train_data['text'])
test_features = vectorizer.transform(test_data['text'])

# Hold out 20% of the training rows. The fixed random_state makes the split,
# and therefore the metrics printed below, repeatable from run to run.
# NOTE: X_val / y_val are not used anywhere further down in this cell.
X_train, X_val, y_train, y_val = train_test_split(
    train_features, train_data['label'], test_size=0.2, random_state=42)

# Train a Naive Bayes model (replace with a different model if desired)
model = MultinomialNB()
model.fit(X_train, y_train)

# Evaluation on the test data
predictions = model.predict(test_features)

from sklearn.metrics import accuracy_score, f1_score

accuracy = accuracy_score(test_data['label'], predictions)
f1 = f1_score(test_data['label'], predictions, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)


In [None]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

def load_data(filepath):
  """Read the CSV file at ``filepath`` and return it as a pandas DataFrame."""
  frame = pd.read_csv(filepath)
  return frame

def clean_text(text):
  """Performs text cleaning and stemming on a single document.

  The text is lower-cased, every character that is neither alphanumeric nor
  whitespace is dropped, the remainder is tokenised with ``word_tokenize``,
  tokens present in the module-level ``stop_words`` set are removed and the
  rest are stemmed with ``PorterStemmer``.

  Args:
    text: Raw document. Anything that is not a ``str`` (for example the NaN
      pandas uses for a missing cell) is treated as an empty document.

  Returns:
    The stemmed tokens joined by single spaces ('' if nothing survives).
  """
  if not isinstance(text, str):
    return ''
  text = text.lower()
  # Keep every kind of whitespace, not only ' ': otherwise words separated by a
  # newline or tab get glued into one token once the separator is stripped.
  text = ''.join(char for char in text if char.isalnum() or char.isspace())
  tokens = word_tokenize(text)
  stemmer = PorterStemmer()
  return ' '.join(stemmer.stem(w) for w in tokens if w not in stop_words)

def augment_with_synonyms(stemmed_words):
  """Map each word to a WordNet lemma name where one is available.

  For every word, ``wordnet.synsets`` is queried; if it returns anything, the
  word is replaced by the name of the first lemma of the first synset,
  otherwise the word is kept unchanged.

  Args:
    stemmed_words: Iterable of word strings.

  Returns:
    A list with one entry per input word, in the same order.
  """
  def _first_lemma_or_self(token):
    senses = wordnet.synsets(token)
    return senses[0].lemmas()[0].name() if senses else token

  return [_first_lemma_or_self(token) for token in stemmed_words]

def create_features(data, use_synonyms=False, ngram_range=(1, 1), vectorizer=None):
  """Creates TF-IDF features from the cleaned ``text`` column of ``data``.

  Args:
    data: DataFrame with a ``text`` column; each value is passed through
      ``clean_text``.
    use_synonyms: If True, the tokens of every cleaned document are passed
      through ``augment_with_synonyms`` before vectorising.
    ngram_range: ``ngram_range`` given to the ``TfidfVectorizer`` that is
      created when ``vectorizer`` is None. Ignored otherwise.
    vectorizer: Optional, already fitted vectorizer. When supplied, ``data``
      is only transformed with it, so the result has the same columns as the
      matrix the vectorizer was fitted on (use this for validation/test data).

  Returns:
    The feature matrix produced by the vectorizer.
  """
  # clean_text reads the module-level stop_words itself, so no local copy of
  # the stop-word set is needed here.
  cleaned_text = data['text'].apply(clean_text)
  if use_synonyms:
    cleaned_text = cleaned_text.apply(lambda x: ' '.join(augment_with_synonyms(x.split())))
  if vectorizer is None:
    # No shared vectorizer: fit a new one on this data alone. Matrices returned
    # by two such calls are built from different vocabularies and must not be
    # mixed (e.g. training on one and predicting on the other).
    return TfidfVectorizer(max_features=1000, ngram_range=ngram_range).fit_transform(cleaned_text)
  return vectorizer.transform(cleaned_text)

def evaluate_model(model_name, model, X_train, X_val, y_train, y_val):
  """Fit ``model`` on the training split and print its validation metrics.

  Args:
    model_name: Label used in the printed lines.
    model: Estimator exposing ``fit`` and ``predict``; it is fitted in place.
    X_train, y_train: Data the model is fitted on.
    X_val, y_val: Held-out data the predictions are scored against.

  Prints accuracy and weighted F1; nothing is returned.
  """
  model.fit(X_train, y_train)
  val_predictions = model.predict(X_val)
  val_accuracy = accuracy_score(y_val, val_predictions)
  val_f1 = f1_score(y_val, val_predictions, average='weighted')
  print(f"Accuracy ({model_name}):", val_accuracy)
  print(f"F1 Score ({model_name}):", val_f1)

def main():
  """Loads data, builds TF-IDF features, compares models and scores the best.

  Steps:
    1. Read ``train.csv`` and ``test.csv``.
    2. Clean the text and fit a single TF-IDF vectorizer on the training text;
       the same fitted vectorizer transforms the test text.
    3. Hold out 20% of the training rows for validation.
    4. Fit every candidate model, print its validation metrics and keep the
       one with the highest validation accuracy.
    5. Print accuracy and weighted F1 of that model on the test data.
  """
  train_data = load_data("train.csv")
  test_data = load_data("test.csv")

  # Feature Engineering. create_features fits a new vectorizer on every call,
  # so calling it once per dataset would give train and test matrices whose
  # columns do not correspond. Fit once on train and reuse for test instead.
  vectorizer = TfidfVectorizer(max_features=1000)
  X_train_full = vectorizer.fit_transform(train_data['text'].apply(clean_text))
  X_test = vectorizer.transform(test_data['text'].apply(clean_text))

  # Train-Test Split for validation (seeded so results are repeatable)
  X_train, X_val, y_train, y_val = train_test_split(
      X_train_full, train_data['label'], test_size=0.2, random_state=42)

  # Explore different models
  models = [
      ("MultinomialNB", MultinomialNB()),
      ("Random Forest", RandomForestClassifier(random_state=42))
  ]

  # Evaluate and select the best performing model.
  # Starting from -inf guarantees a model is selected even if every candidate
  # scores 0 on the validation split.
  best_model = None
  best_accuracy = float('-inf')
  for name, model in models:
    evaluate_model(name, model, X_train, X_val, y_train, y_val)
    # evaluate_model only prints, so score once here for the comparison
    val_accuracy = accuracy_score(y_val, model.predict(X_val))
    if val_accuracy > best_accuracy:
      best_model, best_accuracy = model, val_accuracy

  # Final Evaluation on test data with best model
  predictions = best_model.predict(X_test)
  accuracy = accuracy_score(test_data['label'], predictions)
  f1 = f1_score(test_data['label'], predictions, average='weighted')
  print("Final Evaluation on Test Data:")
  print(f"Accuracy: {accuracy}")
  print(f"F1 Score: {f1}")

if __name__ == "__main__":
  main()


In [None]:
from sklearn.model_selection import GridSearchCV
from nltk.translate import Alignment, AlignedSent
from nltk.translate import IBMModel1
import numpy as np
from sklearn.model_selection import GridSearchCV
from googletrans import Translator
import random

# Shared googletrans client; backtranslate() uses it for both translation directions
translator = Translator()

# Define a function for backtranslation augmentation
def backtranslate(text):
    """Paraphrase ``text`` by translating it to another language and back.

    A target language is picked at random from French, Spanish, German and
    Italian; ``text`` is translated to it and the result is translated to
    English, both through the module-level ``translator``. Progress and errors
    are printed.

    Args:
        text: The text to augment.

    Returns:
        The back-translated text. If ``text`` is None the result is "". If
        either translation step raises, the original ``text`` is returned
        unchanged (or "" when it is not a string), so a failed request
        degrades to "no augmentation" instead of blanking the sample.
    """
    if text is None:
        print("Error: Input text is None")
        return ""

    # What to hand back when the service cannot be used for this sample
    fallback = text if isinstance(text, str) else ""

    # Translate text to a random language
    random_lang = random.choice(['fr', 'es', 'de', 'it'])  # Select a random language
    print(f"Translating text to {random_lang}: {text}")

    try:
        translated_text = translator.translate(text, dest=random_lang).text
    except Exception as e:
        # Best effort: any failure of the translation call is reported, not raised
        print(f"Error occurred during translation: {e}")
        return fallback

    # Translate back to original language
    print(f"Translating back to English: {translated_text}")

    try:
        backtranslated_text = translator.translate(translated_text, dest='en').text
    except Exception as e:
        print(f"Error occurred during backtranslation: {e}")
        return fallback

    return backtranslated_text



def augment_data(data):
    """Return a copy of ``data`` whose ``text`` column has been back-translated.

    Every value of ``data['text']`` is passed through ``backtranslate``; the
    input DataFrame itself is left unmodified.
    """
    result = data.copy()
    # Replace the copied text column with its back-translated counterpart
    rewritten_text = result['text'].apply(backtranslate)
    result['text'] = rewritten_text
    return result

def evaluate_with_grid_search(model_name, model, param_grid, X_train, X_val, y_train, y_val):
    """Performs grid search cross-validation and evaluates the winning estimator.

    Args:
        model_name: Label used in the printed lines.
        model: Estimator to tune.
        param_grid: Parameter grid handed to ``GridSearchCV``.
        X_train, y_train: Data used for the 3-fold, accuracy-scored search.
        X_val, y_val: Held-out data the best estimator is scored on.

    Returns:
        ``(best_estimator, validation_accuracy)`` so the caller can compare
        candidates and keep the tuned estimator.
    """
    grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    val_accuracy = evaluate_model(model_name, best_model, X_val, y_val)
    return best_model, val_accuracy

def evaluate_model(model_name, model, X_val, y_val):
    """Scores an already fitted model on validation data and prints the metrics.

    NOTE: this four-argument definition replaces the six-argument
    ``evaluate_model`` defined in the earlier cell once this cell has run.

    Returns:
        The validation accuracy.
    """
    predictions = model.predict(X_val)
    accuracy = accuracy_score(y_val, predictions)
    f1 = f1_score(y_val, predictions, average='weighted')
    print(f"Accuracy ({model_name}): {accuracy}")
    print(f"F1 Score ({model_name}): {f1}")
    return accuracy

def main():
    """Loads data, augments a training subset, tunes models and scores the best.

    Steps:
      1. Read ``train.csv`` / ``test.csv`` and sample 10% of the training rows.
      2. Back-translate the sampled text with ``augment_data``.
      3. Clean the text and fit a single TF-IDF vectorizer on the training
         text; the same fitted vectorizer transforms the test text.
      4. Hold out 20% of the sampled rows for validation.
      5. Grid-search every candidate and keep the tuned estimator with the
         highest validation accuracy.
      6. Print accuracy and weighted F1 of that estimator on the test data.
    """
    train_data = load_data("train.csv")
    test_data = load_data("test.csv")

    # Take a smaller subset of the data for training
    train_data_subset = train_data.sample(frac=0.1, random_state=42)  # Adjust the fraction as needed

    # Augment training data
    augmented_train_data = augment_data(train_data_subset)

    # Feature Engineering. create_features fits a new vectorizer on every call,
    # so calling it once per dataset would give train and test matrices whose
    # columns do not correspond. Fit once on train and reuse for test instead.
    vectorizer = TfidfVectorizer(max_features=1000)
    X_train_full = vectorizer.fit_transform(augmented_train_data['text'].apply(clean_text))
    X_test = vectorizer.transform(test_data['text'].apply(clean_text))

    # Train-Test Split for validation (seeded so results are repeatable)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full, augmented_train_data['label'], test_size=0.2, random_state=42)

    # Define parameter grids for grid search
    nb_param_grid = {'alpha': [0.1, 0.5, 1.0]}
    rf_param_grid = {'n_estimators': [100, 200, 300]}

    # Models to evaluate
    models = [
        ("MultinomialNB", MultinomialNB(), nb_param_grid),
        ("Random Forest", RandomForestClassifier(random_state=42), rf_param_grid)
    ]

    # Evaluate models with grid search and remember the best tuned estimator
    # instead of assuming the first entry in the list wins.
    best_model = None
    best_accuracy = float('-inf')
    for name, model, param_grid in models:
        candidate, val_accuracy = evaluate_with_grid_search(
            name, model, param_grid, X_train, X_val, y_train, y_val)
        if val_accuracy > best_accuracy:
            best_model, best_accuracy = candidate, val_accuracy

    # Final Evaluation on test data. GridSearchCV (default refit=True) has
    # already refitted the selected estimator on X_train with its best
    # parameters, so no further fit is needed here.
    predictions = best_model.predict(X_test)
    accuracy = accuracy_score(test_data['label'], predictions)
    f1 = f1_score(test_data['label'], predictions, average='weighted')
    print("\nFinal Evaluation on Test Data:")
    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")

if __name__ == "__main__":
    main()

