In [56]:
import numpy as np
import json
import random
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV

In [57]:
def chunkify(lst, n):
    """Split a list into chunks of size n."""
    return [lst[i:i + n] for i in range(0, len(lst), n)]

def duplicate(machine_texts, human_texts):
    # Step 1: Determine the number of duplications
    num_duplications = len(machine_texts) // len(human_texts)

    print(num_duplications)
    
    # Step 2: Duplicate the human texts
    expanded_human_texts = human_texts * num_duplications
    
    print(len(expanded_human_texts))

    # Step 3: Sample the remaining number of human texts if needed
    remaining_texts_needed = len(machine_texts) - len(expanded_human_texts)
    
    print(remaining_texts_needed)
    
    expanded_human_texts += random.sample(human_texts, remaining_texts_needed)
    

    # Step 4: Merge and shuffle the lists
    print("human: " + str(len(expanded_human_texts)) + " mahcine: " + str(len(machine_texts)))
    
    balanced_texts = expanded_human_texts + machine_texts
    
    print(f"balanced_texts {len(balanced_texts)}")
    random.shuffle(balanced_texts)
    
    return balanced_texts


train_data_1 = []

with open('domain1_train.json', 'r') as file:
    for line in file:
        data = json.loads(line)
        train_data_1.append(data)

train_data_2_human = []
train_data_2_machine = []
with open('domain2_train.json', 'r') as file:
    for line in file:
        data = json.loads(line)
        if data["label"] == 0: 
            train_data_2_human.append(data)
        else:
            train_data_2_machine.append(data)
# print(len())
print(len(train_data_2_human))
print(len(train_data_2_machine))
balaned_data_2 = duplicate(train_data_2_machine, train_data_2_human)
print(len(balaned_data_2))

train_data = train_data_1 + balaned_data_2

12750
2150
0
0
2150
human: 2150 mahcine: 2150
balanced_texts 4300
4300


In [58]:
X = [entry["text"] for entry in train_data]
y = [entry["label"] for entry in train_data]

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)


In [60]:
# BOW n-grams
#vectorizer = HashingVectorizer(n_features=10000)
vectorizer = CountVectorizer(ngram_range=(1,2))

X_train_text = [' '.join(map(str, sample)) for sample in X_train]
X_test_text = [' '.join(map(str, sample)) for sample in X_test]

X_train_transformed = vectorizer.fit_transform(X_train_text)
X_test_transformed = vectorizer.transform(X_test_text)

In [61]:
logistic_model = LogisticRegression(
    penalty="l2",
    max_iter=1000,
)

In [62]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
grid_search = GridSearchCV(logistic_model, param_grid, cv=10)
grid_search.fit(X_train_transformed, y_train)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_transformed)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")
# kaggle结果76.8%

Test Accuracy: 0.8689075630252101


In [63]:
with open('logistic_output_cross_validation.csv', 'w') as output_file:
    output_file.write('id,class\n')  

    # read in dataset
    with open('test_set.json', 'r') as file:
        for line in file:
            entry = json.loads(line)
            text = entry["text"]

            # word embedding
            X_test = vectorizer.transform([" ".join(map(str, text))])

            # use logistic model to predict
            prediction = best_model.predict(X_test)

            # output csv
            output_file.write(f"{entry['id']},{prediction[0]}\n")

In [55]:
# Write data to a file named "fruits.txt"
with open("test/0.78", "w") as file:
    for item in balaned_data_2:
        file.write(str(item) + "\n")